day1/session3.tex
changeset 194 7288d3867df2
parent 185 e59ab9ab1a89
child 192 1574b3bc6be7
child 199 85a799d90c6a
equal deleted inserted replaced
193:a31b0e76c0fb 194:7288d3867df2
   125 %%   \tableofcontents
   125 %%   \tableofcontents
   126 %%   % You might wish to add the option [pausesections]
   126 %%   % You might wish to add the option [pausesections]
   127 %% \end{frame}
   127 %% \end{frame}
   128 
   128 
   129 \begin{frame}
   129 \begin{frame}
       
   130   \frametitle{More on data processing}
       
   131   \begin{block}{}
       
   132     What do we do if we want to draw Pie charts for the data in a huge data file?
       
   133   \end{block}
       
   134 \end{frame}
       
   135 
       
   136 
       
   137 \begin{frame}
   130   \frametitle{Statistical Analysis and Parsing}
   138   \frametitle{Statistical Analysis and Parsing}
   131   Read the data supplied in \emph{sslc1.txt} and obtain the following statistics:
   139   Read the data supplied in \emph{sslc1.txt} and obtain the following statistics:
   132   \begin{itemize}
   140   \begin{itemize}
   133     \item Average total marks scored in each region
   141     \item Draw a pie chart representing the number of students who scored more than 90\% in Science per region.
   134     \item Subject wise average score of each region
   142     \item Draw a pie chart representing the number of students who scored more than 90\% per subject (All regions combined).
   135     \item \alert{??Subject wise average score for all regions combined??}
       
   136     \item Find the subject wise standard deviation of scores for each region
       
   137   \end{itemize}
   143   \end{itemize}
   138 \end{frame}
   144 \end{frame}
   139 
   145 
   140 \begin{frame}
   146 \begin{frame}
   141   \frametitle{Statistical Analysis and Parsing \ldots}
   147   \frametitle{Statistical Analysis and Parsing \ldots}
   142   Machinery Required -
   148   Machinery Required -
   143   \begin{itemize}
   149   \begin{itemize}
   144     \item File reading and parsing
   150     \item File reading and parsing
   145     \item NumPy arrays - sum by rows and sum by columns
       
   146     \item Dictionaries
   151     \item Dictionaries
   147   \end{itemize}
   152   \end{itemize}
   148 \end{frame}
   153 \end{frame}
   149 
   154 
   150 \begin{frame}
   155 \begin{frame}
   181 \end{frame}
   186 \end{frame}
   182 
   187 
   183 \begin{frame}[fragile]
   188 \begin{frame}[fragile]
   184   \frametitle{Dictionary - Building parsed data}
   189   \frametitle{Dictionary - Building parsed data}
   185   \begin{itemize}
   190   \begin{itemize}
   186     \item Let the parsed data be stored in dictionary \typ{data}
   191     \item Let the parsed data be stored in a list of dictionaries.
   187     \item \begin{lstlisting}
   192     \item d = \{\} is an empty dictionary
   188 data = {}  # is an empty dictionary
   193   \end{itemize}
   189 \end{lstlisting}
   194 \end{frame}
       
   195 
       
   196 \begin{frame}[fragile]
       
   197   \frametitle{Dictionary - Building parsed data}
       
   198 \begin{lstlisting}
       
   199 ninety_percents = [{}, {}, {}, {}, {}]
       
   200 \end{lstlisting}
       
   201 \end{frame}
       
   202 
       
   203 \begin{frame}[fragile]
       
   204   \frametitle{Dictionary - Building parsed data}
       
   205   \begin{itemize}
   190     \item Index of a dictionary is called a \emph{key}
   206     \item Index of a dictionary is called a \emph{key}
   191     \item \emph{Keys} of \typ{data} are strings - region codes
   207     \item \emph{Keys} of these dictionaries are strings - region codes
   192     \item Value of a \emph{key} can be any Python object
   208   \end{itemize}
   193   \end{itemize}
   209 \end{frame}
   194 \end{frame}
   210 
   195 
   211 \begin{frame}[fragile]
   196 \begin{frame}[fragile]
   212   \frametitle{Dictionary - Building parsed data \ldots}
   197   \frametitle{Dictionary - Building parsed data...}
   213   \begin{itemize}
   198   \begin{itemize}
   214     \item Value of a \emph{key} can be any legal Python value
   199     \item In this problem let the value of a \emph{key} be another dictionary.
   215     \item In this problem let the value of a \emph{key} be an integer
   200     \item This dictionary contains:
   216     \item This dictionary contains:
   201     \begin{itemize}
   217   \end{itemize}
   202       \item 'marks': A \emph{List} of \emph{Lists} containing all marks
   218 'region code': Number of students who scored more than 90\% in this region for this subject
   203       \item 'total': A \emph{List} of total marks of each student
   219 \end{frame}
   204       \item 'P': Number of passes
   220 
   205       \item 'F': Number of failures
   221 \begin{frame}[fragile]
   206       \item 'W': Number of withdrawals
   222   \frametitle{Building parsed data \ldots}
   207     \end{itemize}
   223   \begin{lstlisting}
   208   \end{itemize}
   224 from pylab import *
   209 \end{frame}
   225 
   210 
   226 ninety_percents = [{}, {}, {}, {}, {}]
   211 \begin{frame}[fragile]
   227 
   212   \frametitle{Dictionary - Building parsed data \ldots}
   228 for record in open('sslc1.txt'):
       
   229     record = record.strip()
       
   230     fields = record.split(';')
       
   231 
       
   232     region_code = fields[0].strip()
       
   233   \end{lstlisting}
       
   234 \end{frame}
       
   235 
       
   236 \begin{frame}[fragile]
       
   237   \frametitle{Building parsed data \ldots}
   213   \small
   238   \small
   214   \begin{lstlisting}
   239   \begin{lstlisting}
   215 data = {}
   240 for i, field in enumerate(fields[3:8]):
   216 for record in open('sslc1.txt'):
   241 
   217     fields = record.split(';')
   242     if region_code not in ninety_percents[i]:
   218     if fields[0] not in data:
   243         ninety_percents[i][region_code] = 0
   219         data[fields[0]] = {
   244 
   220             'marks': [],
       
   221             'total': [],
       
   222             'P': 0,
       
   223             'F': 0,
       
   224             'W': 0
       
   225             }
       
   226   \end{lstlisting}
       
   227 \end{frame}
       
   228 
       
   229 \begin{frame}[fragile]
       
   230   \frametitle{Dictionary - Building parsed data \ldots}
       
   231   \begin{lstlisting}
       
   232 marks = []
       
   233 for field in fields[3:8]:
       
   234     score_str = field.strip()
   245     score_str = field.strip()
   235     score = 0 if score_str == 'AA'
   246 
   236         or score_str == 'AAA'
   247     score = 0 if score_str == 'AA' else 
   237         or score_str == ''
   248                          int(score_str)
   238         else int(score_str)
   249     if score > 90:
   239     marks.append(score)
   250         ninety_percents[i][region_code] += 1
   240 
   251   \end{lstlisting}
   241 data[fields[0]]['marks'].append(marks)
   252 \end{frame}
   242   \end{lstlisting}
   253 
   243 \end{frame}
   254 \begin{frame}[fragile]
   244 
   255   \frametitle{Consolidating data}
   245 \begin{frame}[fragile]
   256   \begin{lstlisting}
   246   \frametitle{Dictionary - Building parsed data \ldots}
   257 subj_total = []
   247   \begin{lstlisting}
   258 for subject in ninety_percents:
   248 total = 0 if score_str == 'AA'
   259     subj_total.append(sum(
   249     or score_str == 'AAA'
   260          subject.values()))
   250     or score_str == ''
   261   \end{lstlisting}
   251     else int(fields[8])
   262 \end{frame}
   252 data[fields[0]]['total'].append(total)
   263 
   253   \end{lstlisting}
   264 \begin{frame}[fragile]
   254 \end{frame}
   265   \frametitle{Pie charts}
   255 
       
   256 \begin{frame}[fragile]
       
   257   \frametitle{Dictionary - Building parsed data \ldots}
       
   258   \begin{lstlisting}
       
   259 pfw_key = fields[9]
       
   260     or fields[10]
       
   261     or 'F'
       
   262 data[fields[0]][pfw_key] += 1
       
   263   \end{lstlisting}
       
   264 \end{frame}
       
   265 
       
   266 \begin{frame}[fragile]
       
   267   \frametitle{NumPy arrays}
       
   268   \centerline{\alert{But I lied!?!?!?}}
       
   269 \end{frame}
       
   270 
       
   271 \begin{frame}[fragile]
       
   272   \frametitle{Calculations}
       
   273   \begin{lstlisting}
       
   274 for k in data:
       
   275     data[k]['marks'] = array(
       
   276         data[k]['marks'])
       
   277     data[k]['total'] = array(
       
   278         data[k]['total'])
       
   279   \end{lstlisting}
       
   280 \end{frame}
       
   281 
       
   282 \begin{frame}[fragile]
       
   283   \frametitle{Calculations}
       
   284   \small
   266   \small
   285   \begin{lstlisting}
   267   \begin{lstlisting}
   286     data[k]['avg'] = average(
   268 figure(1)
   287         data[k]['total'])
   269 pie(ninety_percents[4].values(), 
   288     marks = data[k]['marks']
   270     labels=ninety_percents[1].keys())
   289     sub_avg = average(marks, axis=1)
   271 title('Students scoring 90% and above 
   290     sub_std = sqrt(sum(square(
   272       in science by region')
   291         sub_avg[:,newaxis] - marks), axis=0) /
   273 savefig('/tmp/science.png')
   292         len(marks))
   274   \end{lstlisting}
   293     data[k]['sub_avg'] = sub_avg
   275 \begin{columns}
   294     data[k]['sub_std'] = sub_std
   276     \column{5.25\textwidth}
   295   \end{lstlisting}
   277     \hspace*{1.1in}
   296 \end{frame}
   278 \includegraphics[height=2in, interpolate=true]{data/science}
   297 
   279     \column{0.8\textwidth}
   298 \begin{frame}[fragile]
   280 \end{columns}
   299   \frametitle{New Concepts}
   281 \end{frame}
   300   \begin{itemize}
   282 
   301    \item Dictionaries
   283 \begin{frame}[fragile]
   302    \item Slicing lists
   284   \frametitle{Pie charts}
   303    \item New type of conditional
   285   \begin{lstlisting}
   304    \item NumPy arrays
   286 figure(2)
   305    \item Slicing NumPy arrays
   287 pie(subj_total, labels=['English',
   306    \item NumPy array functions - square, average, sqrt
   288     'Hindi', 'Maths', 'Science',
   307   \end{itemize}
   289     'Social'])
       
   290 title('Students scoring more than
       
   291       90% by subject(All regions
       
   292       combined).')
       
   293 savefig('/tmp/all_regions.png')
       
   294   \end{lstlisting}
       
   295 \end{frame}
       
   296 
       
   297 \begin{frame}[fragile]
       
   298   \frametitle{Pie charts}
       
   299   \includegraphics[height=3in, interpolate=true]{data/all_regions}
   308 \end{frame}
   300 \end{frame}
   309 
   301 
   310 \begin{frame}[fragile]
   302 \begin{frame}[fragile]
   311 \frametitle{Dealing with data whole-sale}
   303 \frametitle{Dealing with data whole-sale}
   312 \begin{lstlisting}
   304 \begin{lstlisting}
   313 In []: for t in T:
   305 In []: for t in T:
   314  ....:     Tsq.append(t*t)
   306  ....:     TSq.append(t*t)
   315 \end{lstlisting}
   307 \end{lstlisting}
   316 \begin{itemize}
   308 \begin{itemize}
   317 \item This is not very efficient
   309 \item This is not very efficient
   318 \item We are squaring element after element
   310 \item We are squaring element after element
   319 \item We use arrays to make this efficient
   311 \item We use arrays to make this efficient
   320 \end{itemize}
   312 \end{itemize}
   321 \begin{lstlisting}
   313 \begin{lstlisting}
   322 In []: L = array(L)
   314 In []: L = array(L)
   323 In []: T = array(T)
   315 In []: T = array(T)
   324 In []: Tsq = T*T
   316 In []: TSq = T*T
   325 \end{lstlisting}
   317 \end{lstlisting}
   326 \end{frame}
   318 \end{frame}
   327 
   319 
   328 \begin{frame}[fragile]
   320 \begin{frame}[fragile]
   329 \frametitle{Arrays}
   321 \frametitle{Arrays}
   407 \begin{itemize}
   399 \begin{itemize}
   408 \item Now use the \typ{lstsq} function
   400 \item Now use the \typ{lstsq} function
   409 \item Along with a lot of things, it returns the least squares solution
   401 \item Along with a lot of things, it returns the least squares solution
   410 \end{itemize}
   402 \end{itemize}
   411 \begin{lstlisting}
   403 \begin{lstlisting}
   412 In []: coef, res, r, s = lstsq(A,Tsq)
   404 In []: coef, res, r, s = lstsq(A,TSq)
   413 \end{lstlisting}
   405 \end{lstlisting}
   414 \end{frame}
   406 \end{frame}
   415 
   407 
   416 \begin{frame}[fragile]
   408 \begin{frame}[fragile]
   417 \frametitle{Least Square Fit Line \ldots}
   409 \frametitle{Least Square Fit Line \ldots}
   425 \begin{lstlisting}
   417 \begin{lstlisting}
   426 In []: plot(L, Tline)
   418 In []: plot(L, Tline)
   427 \end{lstlisting}
   419 \end{lstlisting}
   428 \end{frame}
   420 \end{frame}
   429 
   421 
       
   422 \begin{frame}[fragile]
       
   423   \frametitle{What did we learn?}
       
   424   \begin{itemize}
       
   425    \item Dictionaries
       
   426    \item Drawing pie charts
       
   427    \item Arrays
       
   428    \item Least Square fitting
       
   429    \item Intro to Matrices
       
   430   \end{itemize}
       
   431 \end{frame}
   430 \end{document}
   432 \end{document}