day1/session3.tex
changeset 204 87f914f38ba1
parent 192 1574b3bc6be7
child 205 bba40c856f68
equal deleted inserted replaced
196:59138cb18119 204:87f914f38ba1
   138   \frametitle{Statistical Analysis and Parsing}
   138   \frametitle{Statistical Analysis and Parsing}
   139   Read the data supplied in \emph{sslc1.txt} and obtain the following statistics:
   139   Read the data supplied in \emph{sslc1.txt} and obtain the following statistics:
   140   \begin{itemize}
   140   \begin{itemize}
   141     \item Draw a pie chart representing the number of students who scored more than 90\% in Science per region.
   141     \item Draw a pie chart representing the number of students who scored more than 90\% in Science per region.
   142     \item Draw a pie chart representing the number of students who scored more than 90\% per subject (all regions combined).
   142     \item Draw a pie chart representing the number of students who scored more than 90\% per subject (all regions combined).
       
   143     \item Print mean, median, mode and standard deviation of math scores for all regions combined.
   143   \end{itemize}
   144   \end{itemize}
   144 \end{frame}
   145 \end{frame}
   145 
   146 
   146 \begin{frame}
   147 \begin{frame}
   147   \frametitle{Statistical Analysis and Parsing \ldots}
   148   \frametitle{Statistical Analysis and Parsing \ldots}
   148   Machinery Required -
   149   Machinery Required -
   149   \begin{itemize}
   150   \begin{itemize}
   150     \item File reading and parsing
   151     \item File reading
       
   152     \item Parsing
   151     \item Dictionaries
   153     \item Dictionaries
       
   154     \item NumPy arrays
       
   155     \item Statistical operations
   152   \end{itemize}
   156   \end{itemize}
   153 \end{frame}
   157 \end{frame}
   154 
   158 
   155 \begin{frame}
   159 \begin{frame}
   156   \frametitle{File reading and parsing}
   160   \frametitle{File reading and parsing}
   157   Understanding the structure of sslc1.txt
   161   Understanding the structure of sslc1.txt
   158   \begin{itemize}
   162   \begin{itemize}
   159     \item Each line in the file, i.e.\ each row of the file, is a single record.
   163     \item Each line in the file corresponds to one student's details
   160     \item Each record corresponds to a record of a single student
   164     \item aka record
   161     \item Each record consists of several fields separated by a ';'
   165     \item Each record consists of several fields separated by a ';'
   162   \end{itemize}
   166   \end{itemize}
   163 \end{frame}
   167 \end{frame}
   164 
   168 
   165 \begin{frame}
   169 \begin{frame}
   167   Each record consists of:
   171   Each record consists of:
   168   \begin{itemize}
   172   \begin{itemize}
   169     \item Region Code
   173     \item Region Code
   170     \item Roll Number
   174     \item Roll Number
   171     \item Name
   175     \item Name
   172     \item Marks of 5 subjects
   176     \item Marks of 5 subjects: English, Hindi, Maths, Science, Social
   173     \item Total marks
   177     \item Total marks
   174     \item Pass (P)
   178     \item Pass/Fail (P/F)
   175     \item Withdrawn (W)
   179     \item Withdrawn (W)
   176     \item Fail (F)
       
   177   \end{itemize}
   180   \end{itemize}
   178 \end{frame}
   181 \end{frame}
   179 
   182 
   180 \begin{frame}[fragile]
   183 \begin{frame}[fragile]
   181   \frametitle{File reading and parsing \ldots}
   184   \frametitle{File reading and parsing \ldots}
   184     fields = record.split(';')
   187     fields = record.split(';')
   185   \end{lstlisting}
   188   \end{lstlisting}
   186 \end{frame}
   189 \end{frame}
   187 
   190 
   188 \begin{frame}[fragile]
   191 \begin{frame}[fragile]
       
   192   \frametitle{Dictionary: Introduction}
       
   193   \begin{itemize}
       
   194     \item lists index: 0 \ldots n
       
   195     \item dictionaries index using strings
       
   196   \end{itemize}
       
   197 \begin{block}{Example}
       
   198 d = \{ ``Hitchhiker's guide'' : 42,
       
   199      ``Terminator'' : ``I'll be back''\}\\
       
   200 d[``Terminator''] $\Rightarrow$ ``I'll be back''
       
   201 \end{block}
       
   202 \end{frame}
       
   203 
       
   204 \begin{frame}[fragile]
       
   205   \frametitle{Dictionary: Introduction}
       
   206 \begin{lstlisting}
       
   207 In [1]: d = {"Hitchhiker's guide" : 42,
       
   208       "Terminator" : "I'll be back"}
       
   209 
       
   210 In [2]: d["Hitchhiker's guide"]
       
   211 Out[2]: 42
       
   212 
       
   213 In [3]: "Hitchhiker's guide" in d
       
   214 Out[3]: True
       
   215 
       
   216 In [4]: "Guido" in d
       
   217 Out[4]: False
       
   218 \end{lstlisting}
       
   219 \end{frame}
       
   220 
       
   221 \begin{frame}[fragile]
       
   222   \frametitle{Dictionary: Introduction}
       
   223 \begin{lstlisting}
       
   224 In [5]: d.keys()
       
   225 Out[5]: ['Terminator', "Hitchhiker's 
       
   226                               guide"]
       
   227 
       
   228 In [6]: d.values()
       
   229 Out[6]: ["I'll be back", 42]
       
   230 \end{lstlisting}
       
   231 \end{frame}
       
   232 
       
   233 \begin{frame}[fragile]
       
   234   \frametitle{enumerate: Iterating through list indices}
       
   235 \begin{lstlisting}
       
   236 In [1]: names = ["Guido","Alex", "Tim"]
       
   237 
       
   238 In [2]: for i, name in enumerate(names):
       
   239    ...:     print i, name
       
   240    ...: 
       
   241 0 Guido
       
   242 1 Alex
       
   243 2 Tim
       
   244 \end{lstlisting}
       
   245 \end{frame}
       
   246 
       
   247 \begin{frame}[fragile]
       
   248   \frametitle{Dictionary: Building parsed data}
       
   249     Let our dictionary be:
       
   250     \begin{lstlisting}
       
   251 science = {} # is an empty dictionary
       
   252     \end{lstlisting}
       
   253 \end{frame}
       
   254 
       
   255 \begin{frame}[fragile]
   189   \frametitle{Dictionary - Building parsed data}
   256   \frametitle{Dictionary - Building parsed data}
   190   \begin{itemize}
   257   \begin{itemize}
   191     \item Let the parsed data be stored in list of dictionaries.
   258     \item \emph{Keys} of \emph{science} will be region codes
   192     \item d = \{\} is an empty dictionary
   259     \item Values of \emph{science} will be the number of students who scored more than 90\% in that region
   193   \end{itemize}
   260   \end{itemize}
   194 \end{frame}
       
   195 
       
   196 \begin{frame}[fragile]
       
   197   \frametitle{Dictionary - Building parsed data}
       
   198 \begin{lstlisting}
       
   199 ninety_percents = [{}, {}, {}, {}, {}]
       
   200 \end{lstlisting}
       
   201 \end{frame}
       
   202 
       
   203 \begin{frame}[fragile]
       
   204   \frametitle{Dictionary - Building parsed data}
       
   205   \begin{itemize}
       
   206     \item Index of a dictionary is called a \emph{key}
       
   207     \item \emph{Keys} of these dictionaries are strings - region codes
       
   208   \end{itemize}
       
   209 \end{frame}
       
   210 
       
   211 \begin{frame}[fragile]
       
   212   \frametitle{Dictionary - Building parsed data \ldots}
       
   213   \begin{itemize}
       
   214     \item Value of a \emph{key} can be any legal Python value
       
   215     \item In this problem let the value of a \emph{key} be an integer
       
   216     \item This dictionary contains:
       
   217   \end{itemize}
       
   218 'region code': Number of students who scored more than 90\% in this region for this subject
       
   219 \end{frame}
   261 \end{frame}
   220 
   262 
   221 \begin{frame}[fragile]
   263 \begin{frame}[fragile]
   222   \frametitle{Building parsed data \ldots}
   264   \frametitle{Building parsed data \ldots}
   223   \begin{lstlisting}
   265   \begin{lstlisting}
   224 from pylab import *
   266 from pylab import pie
   225 
   267 
   226 ninety_percents = [{}, {}, {}, {}, {}]
   268 science = {}
   227 
   269 
   228 for record in open('sslc1.txt'):
   270 for record in open('sslc1.txt'):
   229     record = record.strip()
   271     record = record.strip()
   230     fields = record.split(';')
   272     fields = record.split(';')
   231 
   273 
   233   \end{lstlisting}
   275   \end{lstlisting}
   234 \end{frame}
   276 \end{frame}
   235 
   277 
   236 \begin{frame}[fragile]
   278 \begin{frame}[fragile]
   237   \frametitle{Building parsed data \ldots}
   279   \frametitle{Building parsed data \ldots}
   238   \small
   280   \begin{lstlisting}
   239   \begin{lstlisting}
   281 if region_code not in science:
   240 for i, field in enumerate(fields[3:8]):
   282     science[region_code] = 0
   241 
   283 
   242     if region_code not in ninety_percents[i]:
   284 score_str = fields[4].strip()
   243         ninety_percents[i][region_code] = 0
   285 
   244 
   286 score = int(score_str) if
   245     score_str = field.strip()
   287     score_str != 'AA' else 0
   246 
   288 
   247     score = 0 if score_str == 'AA' else 
   289 if score > 90:
   248                          int(score_str)
   290     science[region_code] += 1
   249     if score > 90:
       
   250         ninety_percents[i][region_code] += 1
       
   251   \end{lstlisting}
       
   252 \end{frame}
       
   253 
       
   254 \begin{frame}[fragile]
       
   255   \frametitle{Consolidating data}
       
   256   \begin{lstlisting}
       
   257 subj_total = []
       
   258 for subject in ninety_percents:
       
   259     subj_total.append(sum(
       
   260          subject.values()))
       
   261   \end{lstlisting}
   291   \end{lstlisting}
   262 \end{frame}
   292 \end{frame}
   263 
   293 
   264 \begin{frame}[fragile]
   294 \begin{frame}[fragile]
   265   \frametitle{Pie charts}
   295   \frametitle{Pie charts}
   266   \small
   296   \small
   267   \begin{lstlisting}
   297   \begin{lstlisting}
   268 figure(1)
   298 figure(1)
   269 pie(ninety_percents[4].values(), 
   299 pie(science.values(), 
   270     labels=ninety_percents[1].keys())
   300     labels=science.keys())
   271 title('Students scoring 90% and above 
   301 title('Students scoring 90% and above 
   272       in science by region')
   302       in science by region')
   273 savefig('/tmp/science.png')
   303 savefig('/tmp/science.png')
   274   \end{lstlisting}
   304   \end{lstlisting}
   275 \begin{columns}
   305 \begin{columns}
   276     \column{5.25\textwidth}
   306     \column{5.25\textwidth}
   277     \hspace*{1.1in}
   307     \hspace*{1.1in}
   278 \includegraphics[height=2in, interpolate=true]{data/science}
   308 \includegraphics[height=2in, interpolate=true]{data/science}
   279     \column{0.8\textwidth}
   309     \column{0.8\textwidth}
   280 \end{columns}
   310 \end{columns}
       
   311 \end{frame}
       
   312 
       
   313 \begin{frame}[fragile]
       
   314   \frametitle{Building data for all subjects \ldots}
       
   315   \begin{lstlisting}
       
   316 from pylab import pie
       
   317 from scipy import mean, median, std
       
   318 from scipy import stats
       
   319 
       
   320 scores = [[]] * 5
       
   321 ninety_percents = [{}] * 5
       
   322   \end{lstlisting}
       
   323 \end{frame}
       
   324 
       
   325 \begin{frame}[fragile]
       
   326   \frametitle{Building data for all subjects \ldots}
       
   327   \begin{lstlisting}
       
   328 from pylab import pie
       
   329 from scipy import mean, median, std
       
   330 from scipy import stats
       
   331   \end{lstlisting}
       
   332 
       
   333   \begin{block}{Repeating list items}
       
   334     \begin{lstlisting}
       
   335 scores = [[]] * 5
       
   336 ninety_percents = [{}] * 5
       
   337     \end{lstlisting}
       
   338   \end{block}
       
   339 \end{frame}
       
   340 
       
   341 \begin{frame}[fragile]
       
   342   \frametitle{Building data for all subjects \ldots}
       
   343   \begin{lstlisting}
       
   344 for record in open('sslc1.txt'):
       
   345     record = record.strip()
       
   346     fields = record.split(';')
       
   347 
       
   348     region_code = fields[0].strip()
       
   349   \end{lstlisting}
       
   350 \end{frame}
       
   351 
       
   352 \begin{frame}[fragile]
       
   353   \frametitle{Building data for all subjects \ldots}
       
   354   \small
       
   355   \begin{lstlisting}
       
   356 for i, field in enumerate(fields[3:8]):
       
   357     if region_code not in ninety_percents[i]:
       
   358         ninety_percents[i][region_code] = 0
       
   359 
       
   360     score_str = field.strip()
       
   361     score = int(score_str) if
       
   362       score_str != 'AA' else 0
       
   363 
       
   364     scores[i].append(score)
       
   365 
       
   366     if score > 90:
       
   367         ninety_percents[i][region_code] += 1
       
   368   \end{lstlisting}
       
   369 \end{frame}
       
   370 
       
   371 \begin{frame}[fragile]
       
   372   \frametitle{Consolidating data}
       
   373   \begin{lstlisting}
       
   374 subj_total = []
       
   375 for subject in ninety_percents:
       
   376     subj_total.append(sum(
       
   377          subject.values()))
       
   378   \end{lstlisting}
   281 \end{frame}
   379 \end{frame}
   282 
   380 
   283 \begin{frame}[fragile]
   381 \begin{frame}[fragile]
   284   \frametitle{Pie charts}
   382   \frametitle{Pie charts}
   285   \begin{lstlisting}
   383   \begin{lstlisting}
   295 \end{frame}
   393 \end{frame}
   296 
   394 
   297 \begin{frame}[fragile]
   395 \begin{frame}[fragile]
   298   \frametitle{Pie charts}
   396   \frametitle{Pie charts}
   299   \includegraphics[height=3in, interpolate=true]{data/all_regions}
   397   \includegraphics[height=3in, interpolate=true]{data/all_regions}
       
   398 \end{frame}
       
   399 
       
   400 \begin{frame}[fragile]
       
   401   \frametitle{Obtaining statistics}
       
   402   \begin{lstlisting}
       
   403 math_scores = array(scores[2])
       
   404 
       
   405 print "Mean: ", mean(math_scores)
       
   406 
       
   407 print "Median: ", median(math_scores)
       
   408 
       
   409 print "Mode: ", stats.mode(math_scores)
       
   410 
       
   411 print "Standard Deviation: ",
       
   412               std(math_scores)
       
   413   \end{lstlisting}
       
   414 \end{frame}
       
   415 
       
   416 \begin{frame}[fragile]
       
   417   \frametitle{What tools did we use?}
       
   418   \begin{itemize}
       
   419    \item Dictionaries for storing data
       
   420    \item Facilities for drawing pie charts
       
   421    \item NumPy arrays for efficient array manipulations
       
   422    \item Functions for statistical computations - mean, median, mode, standard deviation
       
   423   \end{itemize}
   300 \end{frame}
   424 \end{frame}
   301 
   425 
   302 \begin{frame}[fragile]
   426 \begin{frame}[fragile]
   303 \frametitle{Dealing with data whole-sale}
   427 \frametitle{Dealing with data whole-sale}
   304 \begin{lstlisting}
   428 \begin{lstlisting}
   417 \begin{lstlisting}
   541 \begin{lstlisting}
   418 In []: plot(L, Tline)
   542 In []: plot(L, Tline)
   419 \end{lstlisting}
   543 \end{lstlisting}
   420 \end{frame}
   544 \end{frame}
   421 
   545 
   422 \begin{frame}[fragile]
       
   423   \frametitle{What did we learn?}
       
   424   \begin{itemize}
       
   425    \item Dictionaries
       
   426    \item Drawing pie charts
       
   427    \item Arrays
       
   428    \item Least Square fitting
       
   429    \item Intro to Matrices
       
   430   \end{itemize}
       
   431 \end{frame}
       
   432 \end{document}
   546 \end{document}