Workshop materials: changeset 122:73374e1ae4f3

--- a/day1/session3.tex	Thu Oct 15 11:23:01 2009 +0530
+++ b/day1/session3.tex	Thu Oct 15 11:23:47 2009 +0530
@@ -252,6 +252,136 @@
 \end{itemize}
 \end{frame}
 
+\begin{frame}
+  \frametitle{Statistical Analysis and Parsing}
+  Read the data supplied in \emph{sslc1.txt} and obtain the following statistics:
+  \begin{itemize}
+    \item Average total marks scored in each region
+    \item Subject-wise average score of each region
+    \item ??Subject-wise average score for all regions combined??
+  \end{itemize}
+\end{frame}
+
+\begin{frame}
+  \frametitle{Statistical Analysis and Parsing \ldots}
+  Machinery Required -
+  \begin{itemize}
+    \item File reading and parsing
+    \item NumPy arrays - sum by rows and sum by columns
+    \item Dictionaries
+  \end{itemize}
+\end{frame}
+
+\begin{frame}
+  \frametitle{File reading and parsing}
+  Understanding the structure of sslc1.txt
+  \begin{itemize}
+    \item Each line in the file, i.e.\ each row of the file, is a single record.
+    \item Each record corresponds to a single student
+    \item Each record consists of several fields separated by a `;'
+  \end{itemize}
+\end{frame}
+
+\begin{frame}
+  \frametitle{File reading and parsing \ldots}
+  Each record consists of:
+  \begin{itemize}
+    \item Region Code
+    \item Roll Number
+    \item Name
+    \item Marks of 5 subjects
+    \item Total marks
+    \item Pass (P)
+    \item Withdrawn (W)
+    \item Fail (F)
+  \end{itemize}
+\end{frame}
+
+\begin{frame}[fragile]
+  \frametitle{File reading and parsing \ldots}
+  \begin{lstlisting}
+for record in open('sslc1.txt'):
+    fields = record.split(';')
+  \end{lstlisting}
+\end{frame}
+
+\begin{frame}[fragile]
+  \frametitle{Dictionary}
+  \begin{itemize}
+    \item lists index: 0 \ldots n
+    \item dictionaries index using any hashable objects
+    \item d = \{ ``Hitchhiker's guide'' : 42, ``Terminator'' : ``I'll be back''\}
+    \item d[``Terminator''] $\Rightarrow$ ``I'll be back''
+    \item ``Terminator'' is called the key of \typ{d}
+    \item ``I'll be back'' is called the value of the key ``Terminator''
+  \end{itemize}
+\end{frame}
+
+\begin{frame}[fragile]
+  \frametitle{Dictionary - Building parsed data}
+  \begin{itemize}
+    \item Let the parsed data be stored in dictionary \typ{data}
+    \item Keys of \typ{data} are strings - region codes
+    \item Value of the key is another dictionary.
+    \item This dictionary contains:
+    \begin{itemize}
+      \item 'marks': A list of NumPy arrays
+      \item 'total': Total marks of each student
+      \item 'P': Number of passes
+      \item 'F': Number of failures
+      \item 'W': Number of withdrawals
+    \end{itemize}
+  \end{itemize}
+\end{frame}
+
+\begin{frame}[fragile]
+  \frametitle{Dictionary - Building parsed data \ldots}
+  \small
+  \begin{lstlisting}
+data = {}
+for record in open('sslc1.txt'):
+    fields = record.split(';')
+    if fields[0] not in data:
+        data[fields[0]] = {
+            'marks': array([]),
+            'total': array([]),
+            'P': 0,
+            'F': 0,
+            'W': 0
+            }
+  \end{lstlisting}
+\end{frame}
+
+\begin{frame}[fragile]
+  \frametitle{Dictionary - Building parsed data \ldots}
+  \small
+  \begin{lstlisting}
+data[fields[0]]['marks'] = append(
+    data[fields[0]]['marks'], 
+    [int(fields[3]), int(fields[4]),
+    int(fields[5]), int(fields[6]),
+    int(fields[7])
+    ])
+
+data[fields[0]]['total'] = append(
+    data[fields[0]]['total'], int(fields[8]))
+
+pfw_key = fields[9] or fields[10] or fields[11]
+data[fields[0]][pfw_key] += 1
+  \end{lstlisting}
+\end{frame}
+
+\begin{frame}[fragile]
+  \frametitle{Calculations}
+  \begin{lstlisting}
+all_sub_avg = array([])
+for k, v in data.items():
+    data[k]['avg'] = average(
+        data[k]['total'])
+    data[k]['sub_avg'] = average(
+        data[k]['marks'], axis=1)
+  \end{lstlisting}
+\end{frame}
+
 \end{document}
 
 Least squares: Smooth curve fit.
author	Santosh G. Vattam <vattam.santosh@gmail.com>
	Thu, 15 Oct 2009 11:23:47 +0530
changeset 122	73374e1ae4f3
parent 121	ac715f4826f2 (current diff)
parent 120	055b199c46c2 (diff)
child 124	d43a698712e0