% Preamble: LaTeX2e class declaration and notation macros for the exam.
% \documentstyle is obsolete LaTeX 2.09 syntax (forces 2e compatibility mode);
% the old "style options" psfig/fullpage/11pt become class options + packages.
% psfig is deprecated; graphicx is its modern replacement.
\documentclass[11pt]{article}
\usepackage{fullpage}
\usepackage{graphicx}
% \fd: functional-dependency arrow, as in $X \fd Y$.
\newcommand*{\fd}{\mathrel{\rightarrow}}
% \mvd: multivalued-dependency double-headed arrow (two overlapped arrows).
\newcommand*{\mvd}{\hbox{$\rightarrow$\hskip-5pt$\rightarrow$}}
% \ans: begins a bold "Answer:" paragraph (used when answer keys are filled in).
\newcommand*{\ans}{\par\smallskip\noindent\textbf{Answer:~}}
% \tjoin{c}: theta-join -- a bowtie with condition #1 set beneath it.
\newcommand*{\tjoin}[1]{\mathrel{\hbox{$\hbox{$\bowtie$}\atop#1$}}}
% \join: natural join (bowtie).
\newcommand*{\join}{\mathrel{\bowtie}}
% \sjoin: semijoin symbol, hand-built from a vertical rule and overlapped > <.
\newcommand*{\sjoin}{\mathrel{\raise1pt\hbox{\vrule
height5pt depth0pt\hskip-1.5pt$>$\hskip
-3pt$<$}}}
\begin{document}
\begin{center}
{\large\bfseries PART B (BASIC)}
\end{center}
\begin{enumerate}
\item[B.1]
(5 pts.)
Convert the following Datalog rule into an expression of relational
algebra.
\begin{verbatim}
p(X,Y) :- r(X,Z,Z) & s(Z,Y,Y)
\end{verbatim}
It is OK to write a sequence of assignments to temporaries, leading to
your final algebraic expression, if you prefer not to write a single
expression.
\item[B.2]
(7 pts.)
{\em Armstrong's Axioms} for functional dependencies are the following
three: If $Y\subseteq X$, then $X\fd Y$ ({\em Reflexivity}); If $X\fd Y$,
then $XZ\fd YZ$ ({\em Augmentation}); If $X\fd Y$ and $YZ\fd W$, then
$XZ\fd W$ ({\em Pseudotransitivity}).
Use these axioms to prove the {\em Union Rule\/}: If $X\fd Y$ and $X\fd
Z$, then $X\fd YZ$.
\item[B.3]
(8 pts.)
A market-basket dataset has five items: Bread, Spaghetti, Rice, Potatoes, and
Couscous.
Half the baskets are filled with exactly one item, and the item is equally
likely to be any of the five items.
The other half of the baskets are filled by choosing whether or not to
buy each of the five items, independently.
With probability 50\%, Bread is added to the basket.
With probability 40\%, Spaghetti is added to the basket.
With probability 30\%, Rice is added to the basket.
With probability 20\%, Potatoes is added to the basket.
With probability 10\%, Couscous is added to the basket.
\begin{enumerate}
\item
(2 pts.)
What is the support of $\{$Rice$\}$ (as a fraction of the number of
baskets)?
\item
(4 pts.)
What is the association rule of the form $\{A\}\fd B$, where $A$ and $B$
are distinct single items, that has the highest confidence? What is that
confidence?
\item
(2 pts.)
What is the association rule of the form $\{A,B\}\fd C$, where $A$, $B$,
and $C$ are distinct
single items, that has the highest confidence? What is that
confidence?
\end{enumerate}
\pagebreak
\begin{center}
{\large\bfseries PART B (ADVANCED)}
\end{center}
\item[B.4]
(10 pts.)
Consider the following two conjunctive queries with arithmetic.
The head {\tt panic} is a 0-ary predicate, i.e., a propositional
variable.
\begin{center}\begin{tabular}{l l}
$Q_1$: & {\tt panic :- a(X,Y) \& a(Y,X) \& X < Y}\\
$Q_2$: & {\tt panic :- a(A,B) \& a(B,A) \& A > B}\\
\end{tabular}\end{center}
\begin{enumerate}
\item
(5 pts.)
What containments, if any, hold between $Q_1$ and $Q_2$?
Either prove containment using any method, or give counterexamples.
\item
(5 pts.)
If the heads of $Q_1$ and $Q_2$ were changed from {\tt panic} to $p(X)$
and $p(A)$, respectively, i.e.:
\begin{center}\begin{tabular}{l l}
$Q_1$: & {\tt p(X) :- a(X,Y) \& a(Y,X) \& X < Y}\\
$Q_2$: & {\tt p(A) :- a(A,B) \& a(B,A) \& A > B}\\
\end{tabular}\end{center}
how would your answer to (a) change?
\end{enumerate}
\item[B.5]
(10 pts.)
A view-centric integration system has one global predicate $p$, and the
following two views:
\begin{verbatim}
v(X,Y) :- p(X,Y) & p(Y,Z)
w(U,V) :- p(W,U) & p(U,V)
\end{verbatim}
The query to be answered is:
\begin{verbatim}
q(A,B) :- p(A,C) & p(C,B)
\end{verbatim}
\begin{enumerate}
\item
(5 pts.)
What are all the minimal solutions for $q$?
\item
(5 pts.)
Choose one of your solutions.
Explain why it is a solution, and
why it is minimal.
\end{enumerate}
\item[B.6]
(10 pts.)
There is a family of propositional Datalog programs, $D_2,D_3,\ldots$
defined as follows. $D_N$ has the rules:
\begin{center}\begin{tabular}{l l}
{\tt $p_i$ :- NOT $q_i$} & for $i=1,2,\ldots,N$\\
{\tt $q_i$ :- NOT $p_i$} & for $i=1,2,\ldots,N$\\
{\tt $p_{i+1}$ :- $p_i$} & for $i=1,2,\ldots,N-1$\\
\end{tabular}\end{center}
We wish to determine the stable models of $D_N$ for any $N$.
Answer the following:
\begin{enumerate}
\item
(4 pts.)
Can a stable model include both $p_i$ and $q_i$?
Can it include neither?
Justify your answers.
\item
(3 pts.)
Can a stable model include $p_i$ but not include $p_{i+1}$, if $i$ is
between 1 and $N-1$?
Justify your answer carefully.
\item
(3 pts.)
Describe all the stable models for $D_N$.
\end{enumerate}
\item[B.7]
(10 pts.)
A market-basket data set involves 1,000,000 items.
There are 1,000,000,000 (infrequent)
pairs that occur exactly once, and 10,000,000
(frequent) pairs that occur exactly 10 times each.
No other pairs occur at all.
The support threshold $s$ is 10.
We wish to run the PCY (Park-Chen-Yu) algorithm on this data set, and we
hope that the majority of the buckets will prove not to be frequent
buckets. You may assume that:
\begin{itemize}
\item
Frequent pairs will each hash into a
different bucket.
\item
Infrequent pairs will divide as evenly as possible among the buckets.
\item
Integers require 4 bytes, and may be used for item IDs as well as
counts.
\end{itemize}
In the following, answers within 10\% of the exact answer will be considered correct.
\begin{enumerate}
\item
(5 pts.)
What
is the minimum main memory we may use so that most of the buckets
will be infrequent (have counts less than $s = 10$)?
Specifically, under the given assumptions we have as few frequent buckets as possible.
Show your calculation for partial credit, if you wish.
\item
(5 pts.)
If all items prove to be frequent on pass 1, and we use the number of buckets on pass 1 that your answer to (a)
implies, how much memory do we need
on pass~2 to count all the candidate pairs?
\end{enumerate}
\end{enumerate}
\end{document}