day1/session3.tex
changeset 184 9efa777af2e2
parent 178 8a3a9d98fa84
child 185 e59ab9ab1a89
--- a/day1/session3.tex	Tue Oct 27 12:30:24 2009 +0530
+++ b/day1/session3.tex	Tue Oct 27 12:50:19 2009 +0530
@@ -127,13 +127,19 @@
 %% \end{frame}
 
 \begin{frame}
+  \frametitle{More on data processing}
+  \begin{block}{}
+    What do we do if we want to draw pie charts for the data in a huge data file?
+  \end{block}
+\end{frame}
+
+
+\begin{frame}
   \frametitle{Statistical Analysis and Parsing}
   Read the data supplied in \emph{sslc1.txt} and obtain the following statistics:
   \begin{itemize}
-    \item Average total marks scored in each region
-    \item Subject wise average score of each region
-    \item \alert{??Subject wise average score for all regions combined??}
-    \item Find the subject wise standard deviation of scores for each region
+    \item Draw a pie chart representing the number of students who scored more than 90\% in Science per region.
+    \item Draw a pie chart representing the number of students who scored more than 90\% per subject (all regions combined).
   \end{itemize}
 \end{frame}
 
@@ -142,7 +148,6 @@
   Machinery Required -
   \begin{itemize}
     \item File reading and parsing
-    \item NumPy arrays - sum by rows and sum by coloumns
     \item Dictionaries
   \end{itemize}
 \end{frame}
@@ -183,128 +188,115 @@
 \begin{frame}[fragile]
   \frametitle{Dictionary - Building parsed data}
   \begin{itemize}
-    \item Let the parsed data be stored in dictionary \typ{data}
-    \item \begin{lstlisting}
-data = {}  # is an empty dictionary
-\end{lstlisting}
-    \item Index of a dictionary is called a \emph{key}
-    \item \emph{Keys} of \typ{data} are strings - region codes
-    \item Value of a \emph{key} can be any Python object
+    \item Let the parsed data be stored in a list of dictionaries.
+    \item d = \{\} is an empty dictionary
   \end{itemize}
 \end{frame}
 
 \begin{frame}[fragile]
-  \frametitle{Dictionary - Building parsed data...}
+  \frametitle{Dictionary - Building parsed data}
+\begin{lstlisting}
+ninety_percents = [{}, {}, {}, {}, {}]
+\end{lstlisting}
+\end{frame}
+
+\begin{frame}[fragile]
+  \frametitle{Dictionary - Building parsed data}
   \begin{itemize}
-    \item In this problem let the value of a \emph{key} be another dictionary.
-    \item This dictionary contains:
-    \begin{itemize}
-      \item 'marks': A \emph{List} of \emph{Lists} containing all marks
-      \item 'total': A \emph{List} of total marks of each student
-      \item 'P': Number of passes
-      \item 'F': Number of failures
-      \item 'W': Number of withdrawls
-    \end{itemize}
+    \item Index of a dictionary is called a \emph{key}
+    \item \emph{Keys} of these dictionaries are strings - region codes
   \end{itemize}
 \end{frame}
 
 \begin{frame}[fragile]
   \frametitle{Dictionary - Building parsed data \ldots}
-  \small
+  \begin{itemize}
+    \item Value of a \emph{key} can be any legal Python value
+    \item In this problem, let the value of a \emph{key} be an integer
+    \item This dictionary contains:
+  \end{itemize}
+'region code': Number of students who scored more than 90\% in this region for this subject
+\end{frame}
+
+\begin{frame}[fragile]
+  \frametitle{Building parsed data \ldots}
   \begin{lstlisting}
-data = {}
+from pylab import *
+
+ninety_percents = [{}, {}, {}, {}, {}]
+
 for record in open('sslc1.txt'):
+    record = record.strip()
     fields = record.split(';')
-    if fields[0] not in data:
-        data[fields[0]] = {
-            'marks': [],
-            'total': [],
-            'P': 0,
-            'F': 0,
-            'W': 0
-            }
+
+    region_code = fields[0].strip()
   \end{lstlisting}
 \end{frame}
 
 \begin{frame}[fragile]
-  \frametitle{Dictionary - Building parsed data \ldots}
+  \frametitle{Building parsed data \ldots}
+  \small
   \begin{lstlisting}
-marks = []
-for field in fields[3:8]:
-    score_str = field.strip()
-    score = 0 if score_str == 'AA'
-        or score_str == 'AAA'
-        or score_str == ''
-        else int(score_str)
-    marks.append(score)
+for i, field in enumerate(fields[3:8]):
 
-data[fields[0]]['marks'].append(marks)
-  \end{lstlisting}
-\end{frame}
+    if region_code not in ninety_percents[i]:
+        ninety_percents[i][region_code] = 0
+
+    score_str = field.strip()
 
-\begin{frame}[fragile]
-  \frametitle{Dictionary - Building parsed data \ldots}
-  \begin{lstlisting}
-total = 0 if score_str == 'AA'
-    or score_str == 'AAA'
-    or score_str == ''
-    else int(fields[8])
-data[fields[0]]['total'].append(total)
+    score = (0 if score_str == 'AA'
+             else int(score_str))
+    if score > 90:
+        ninety_percents[i][region_code] += 1
   \end{lstlisting}
 \end{frame}
 
 \begin{frame}[fragile]
-  \frametitle{Dictionary - Building parsed data \ldots}
+  \frametitle{Consolidating data}
   \begin{lstlisting}
-pfw_key = fields[9]
-    or fields[10]
-    or 'F'
-data[fields[0]][pfw_key] += 1
-  \end{lstlisting}
-\end{frame}
-
-\begin{frame}[fragile]
-  \frametitle{NumPy arrays}
-  \centerline{\alert{But I lied!?!?!?}}
-\end{frame}
-
-\begin{frame}[fragile]
-  \frametitle{Calculations}
-  \begin{lstlisting}
-for k in data:
-    data[k]['marks'] = array(
-        data[k]['marks'])
-    data[k]['total'] = array(
-        data[k]['total'])
+subj_total = []
+for subject in ninety_percents:
+    subj_total.append(sum(
+         subject.values()))
   \end{lstlisting}
 \end{frame}
 
 \begin{frame}[fragile]
-  \frametitle{Calculations}
+  \frametitle{Pie charts}
   \small
   \begin{lstlisting}
-    data[k]['avg'] = average(
-        data[k]['total'])
-    marks = data[k]['marks']
-    sub_avg = average(marks, axis=1)
-    sub_std = sqrt(sum(square(
-        sub_avg[:,newaxis] - marks), axis=0) /
-        len(marks))
-    data[k]['sub_avg'] = sub_avg
-    data[k]['sub_std'] = sub_std
+figure(1)
+pie(ninety_percents[3].values(), 
+    labels=ninety_percents[3].keys())
+title('Students scoring more than 90%'
+      ' in science by region')
+savefig('/tmp/science.png')
+  \end{lstlisting}
+\begin{columns}
+    \column{5.25\textwidth}
+    \hspace*{1.1in}
+\includegraphics[height=2in, interpolate=true]{data/science}
+    \column{0.8\textwidth}
+\end{columns}
+\end{frame}
+
+\begin{frame}[fragile]
+  \frametitle{Pie charts}
+  \begin{lstlisting}
+figure(2)
+pie(subj_total, labels=['English',
+    'Hindi', 'Maths', 'Science',
+    'Social'])
+title('Students scoring more than '
+      '90% by subject (All regions '
+      'combined).')
+savefig('/tmp/all_regions.png')
   \end{lstlisting}
 \end{frame}
 
 \begin{frame}[fragile]
-  \frametitle{New Concepts}
-  \begin{itemize}
-   \item Dictionaries
-   \item Slicing lists
-   \item New type of conditional
-   \item NumPy arrays
-   \item Slicing NumPy arrays
-   \item NumPy array functions - square, average, sqrt
-  \end{itemize}
+  \frametitle{Pie charts}
+  \includegraphics[height=3in, interpolate=true]{data/all_regions}
 \end{frame}
 
 \begin{frame}[fragile]
@@ -427,4 +419,11 @@
 \end{lstlisting}
 \end{frame}
 
+\begin{frame}[fragile]
+  \frametitle{What did we learn?}
+  \begin{itemize}
+   \item Dictionaries
+   \item Drawing pie charts
+  \end{itemize}
+\end{frame}
 \end{document}