day1/session3.tex
changeset 249 135062d6f91f
parent 240 5a96cf81bdc5
child 252 359f77927931
equal deleted inserted replaced
248:1ebf842cb035 249:135062d6f91f
   124 %%   \frametitle{Outline}
   124 %%   \frametitle{Outline}
   125 %%   \tableofcontents
   125 %%   \tableofcontents
   126 %%   % You might wish to add the option [pausesections]
   126 %%   % You might wish to add the option [pausesections]
   127 %% \end{frame}
   127 %% \end{frame}
   128 
   128 
   129 \section{Statistics}
   129 \section{Processing voluminous data}
   130 \begin{frame}
   130 \begin{frame}
   131   \frametitle{More on data processing}
   131   \frametitle{More on data processing}
   132   \begin{block}{}
   132   \begin{block}{}
   133     We have a huge--1m records--data file.\\How do we do \emph{efficient} statistical computations, that is, find mean, median, mode, standard deviation etc; draw pie charts?
   133     We have a huge--1m records--data file.\\How do we do \emph{efficient} statistical computations, that is, find mean, median, mode, standard deviation etc; draw pie charts?
   134   \end{block}
   134   \end{block}
   135 \end{frame}
   135 \end{frame}
   136 
   136 
   137 
   137 
   138 \begin{frame}
   138 \begin{frame}
   139   \frametitle{Statistical Analysis and Parsing}
   139   \frametitle{Statistical Analysis: Problem statement}
   140   Read the data supplied in \emph{sslc1.txt} and obtain the following statistics:
   140   Read the data supplied in \emph{sslc1.txt} and carry out the following:
   141   \begin{itemize}
   141   \begin{enumerate}
   142     \item Draw a pie chart representing the number of students who scored more than 90\% in Science per region.
   142     \item Draw a pie chart representing the proportion of students who scored more than 90\% in each region in Science.
   143     \item Draw a pie chart representing the number of students who scored more than 90\% per subject(All regions combined).
   143     \item Draw a pie chart representing the proportion of students who scored more than 90\% in each subject across regions.
   144     \item Print mean, median, mode and standard deviation of math scores for all regions combined.
   144     \item Print mean, median, mode and standard deviation of math scores for all regions combined.
   145   \end{itemize}
   145   \end{enumerate}
       
   146 \end{frame}
       
   147 
       
   148 \begin{frame}
       
   149   \frametitle{Problem statement: explanation}
       
   150     \emphbar{Draw a pie chart representing the proportion of students who scored more than 90\% in each region in Science.}
       
   151     \begin{enumerate}
       
   152       \item Complete(100\%) data - Number of students who scored more than 90\% in Science
       
   153       \item Each slice - Number of students who scored more than 90\% in Science in one region
       
   154     \end{enumerate}
       
   155 \end{frame}
       
   156 
       
   157 \begin{frame}
       
   158   \frametitle{Problem statement: explanation}
       
   159     \emphbar{Draw a pie chart representing the proportion of students who scored more than 90\% in each subject across regions.}
       
   160     \begin{enumerate}
       
   161       \item Complete(100\%) data - Number of students who scored more than 90\% across all regions
       
   162       \item Each slice - Number of students who scored more than 90\% in each subject across all regions
       
   163     \end{enumerate}
   146 \end{frame}
   164 \end{frame}
   147 
   165 
   148 \begin{frame}
   166 \begin{frame}
   149   \frametitle{Statistical Analysis and Parsing \ldots}
   167   \frametitle{Statistical Analysis and Parsing \ldots}
   150   Machinery Required -
   168   Machinery Required -
   159 
   177 
   160 \begin{frame}
   178 \begin{frame}
   161   \frametitle{File reading and parsing}
   179   \frametitle{File reading and parsing}
   162   Understanding the structure of sslc1.txt
   180   Understanding the structure of sslc1.txt
   163   \begin{itemize}
   181   \begin{itemize}
   164     \item Each line in the file corresponds to one student's details
   182     \item One line in the file corresponds to one student's details
   165     \item aka record
   183     \item aka record
   166     \item Each record consists of several fields separated by a ';'
   184     \item Each record consists of fields separated by ';'
   167   \end{itemize}
   185   \end{itemize}
   168 \end{frame}
   186 \end{frame}
   169 
   187 
   170 \begin{frame}
   188 \begin{frame}
   171   \frametitle{File reading and parsing \ldots}
   189   \frametitle{File reading and parsing \ldots}
   233 Out[6]: ["I'll be back", 42]
   251 Out[6]: ["I'll be back", 42]
   234   \end{lstlisting}
   252   \end{lstlisting}
   235 \end{frame}
   253 \end{frame}
   236 
   254 
   237 \begin{frame}[fragile]
   255 \begin{frame}[fragile]
       
   256   \frametitle{Back to lists: Iterating}
       
   257   \begin{itemize}
       
   258     \item Python's \kwrd{for} loop iterates through list items
       
   259     \item In other languages (C/C++) we run through indices and pick items from the array using these indices
       
   260     \item In Python, while iterating through list items, the current position is not available
       
   261   \end{itemize}
       
   262   \begin{block}{Iterating through indices}
       
   263     What if we want the index of an item of a list?
       
   264   \end{block}
       
   265 
       
   266 \end{frame}
       
   267 
       
   268 \begin{frame}[fragile]
   238   \frametitle{enumerate: Iterating through list indices}
   269   \frametitle{enumerate: Iterating through list indices}
   239   \begin{lstlisting}
   270   \begin{lstlisting}
   240 In [1]: names = ["Guido","Alex", "Tim"]
   271 In [1]: names = ["Guido","Alex", "Tim"]
   241 
   272 
   242 In [2]: for i, name in enumerate(names):
   273 In [2]: for i, name in enumerate(names):
   248   \end{lstlisting}
   279   \end{lstlisting}
   249   \inctime{5}
   280   \inctime{5}
   250 \end{frame}
   281 \end{frame}
   251 
   282 
   252 \begin{frame}[fragile]
   283 \begin{frame}[fragile]
   253   \frametitle{Dictionary: Building parsed data}
   284   \frametitle{Continuing with our Dictionary}
   254   Let our dictionary be:
   285   Let our dictionary be:
   255   \begin{lstlisting}
   286   \begin{lstlisting}
   256 science = {} # is an empty dictionary
   287 science = {} # is an empty dictionary
   257   \end{lstlisting}
   288   \end{lstlisting}
   258 \end{frame}
   289 \end{frame}
   294 if score > 90:
   325 if score > 90:
   295     science[region_code] += 1
   326     science[region_code] += 1
   296   \end{lstlisting}
   327   \end{lstlisting}
   297 \end{frame}
   328 \end{frame}
   298 
   329 
   299 \subsection{Visualizing the data}
   330 \subsection{Visualizing data}
   300 \begin{frame}[fragile]
   331 \begin{frame}[fragile]
   301   \frametitle{Pie charts}
   332   \frametitle{Pie charts}
   302   \small
   333   \small
   303   \begin{lstlisting}
   334   \begin{lstlisting}
   304 figure(1)
   335 figure(1)
   386 \begin{frame}[fragile]
   417 \begin{frame}[fragile]
   387   \frametitle{Pie charts}
   418   \frametitle{Pie charts}
   388   \includegraphics[height=3in, interpolate=true]{data/all_regions}
   419   \includegraphics[height=3in, interpolate=true]{data/all_regions}
   389 \end{frame}
   420 \end{frame}
   390 
   421 
   391 \subsection{Obtaining stastics}
   422 \subsection{Obtaining statistics}
       
   423 \begin{frame}[fragile]
       
   424   \frametitle{Obtaining statistics}
       
   425   \begin{block}{Statistics: Mean}
       
   426     Obtain the mean of Math scores
       
   427   \end{block}
       
   428 \end{frame}
       
   429 
       
   430 \begin{frame}[fragile]
       
   431   \frametitle{Obtaining statistics: Solution}
       
   432   \begin{block}{Statistics: Mean}
       
   433     Obtain the mean of Math scores
       
   434   \end{block}
       
   435   \begin{lstlisting}
       
   436 math_scores = scores[2]
       
   437 total = 0
       
   438 for i, score in enumerate(math_scores):
       
   439     total += score
       
   440 
       
   441 mean = total / (i + 1)
       
   442 print "Mean: ", mean
       
   443   \end{lstlisting}
       
   444 \end{frame}
       
   445 
       
   446 \begin{frame}[fragile]
       
   447   \frametitle{Obtaining statistics: Another solution}
       
   448   \begin{block}{Statistics: Mean}
       
   449     Obtain the mean of Math scores
       
   450   \end{block}
       
   451   \begin{lstlisting}
       
   452 math_scores = scores[2]
       
   453 mean = sum(math_scores) / \
       
   454           len(math_scores)
       
   455   \end{lstlisting}
       
   456 \end{frame}
       
   457 
       
   458 \begin{frame}[fragile]
       
   459 \frametitle{NumPy arrays}
       
   460   \begin{itemize}
       
   461     \item NumPy provides arrays
       
   462     \item arrays are very efficient and powerful 
       
   463     \item Very easy to perform element-wise operations - \typ{+, -, *, /, \%}
       
   464     \begin{lstlisting}
       
   465 In [1]: a = array([1, 2, 3])
       
   466 In [2]: b = array([4, 5, 6])
       
   467 
       
   468 In [3]: a + b
       
   469 Out[3]: array([5, 7, 9])
       
   470     \end{lstlisting}
       
   471     \item Very easy to compute statistics
       
   472   \end{itemize}
       
   473 \end{frame}
       
   474 
   392 \begin{frame}[fragile]
   475 \begin{frame}[fragile]
   393   \frametitle{Obtaining statistics}
   476   \frametitle{Obtaining statistics}
   394   \begin{lstlisting}
   477   \begin{lstlisting}
   395 math_scores = array(scores[2])
   478 math_scores = array(scores[2])
   396 
   479 
   414    \item NumPy arrays for efficient array manipulations
   497    \item NumPy arrays for efficient array manipulations
   415    \item Functions for statistical computations - mean, median, mode, standard deviation
   498    \item Functions for statistical computations - mean, median, mode, standard deviation
   416   \end{itemize}
   499   \end{itemize}
   417 \end{frame}
   500 \end{frame}
   418 
   501 
       
   502 \section{Least square fit}
   419 \begin{frame}
   503 \begin{frame}
   420 \frametitle{L vs $T^2$ \ldots}
   504 \frametitle{L vs $T^2$ \ldots}
   421 Let's go back to the L vs $T^2$ plot
   505 Let's go back to the L vs $T^2$ plot
   422 \begin{itemize}
   506 \begin{itemize}
   423 \item We first look at obtaining $T^2$ from T
   507 \item We first look at obtaining $T^2$ from T
   439 \begin{lstlisting}
   523 \begin{lstlisting}
   440 In []: L = array(L)
   524 In []: L = array(L)
   441 In []: T = array(T)
   525 In []: T = array(T)
   442 In []: TSq = T*T
   526 In []: TSq = T*T
   443 \end{lstlisting}
   527 \end{lstlisting}
   444 \end{frame}
       
   445 
       
   446 \begin{frame}[fragile]
       
   447 \frametitle{Arrays}
       
   448 \begin{itemize}
       
   449 \item \typ{T} and \typ{L} are now arrays
       
   450 \item arrays are very efficient and powerful 
       
   451 \item Very easy to perform element-wise operations
       
   452 \item \typ{+, -, *, /, \%}
       
   453 \item More about arrays later
       
   454 \end{itemize}
       
   455 \end{frame}
   528 \end{frame}
   456 
   529 
   457 \begin{frame}[fragile]
   530 \begin{frame}[fragile]
   458 \frametitle{Least Squares Fit}
   531 \frametitle{Least Squares Fit}
   459 \vspace{-0.15in}
   532 \vspace{-0.15in}
   506   \end{bmatrix}$
   579   \end{bmatrix}$
   507 \item We need to find $p$ to plot the line
   580 \item We need to find $p$ to plot the line
   508 \end{itemize}
   581 \end{itemize}
   509 \end{frame}
   582 \end{frame}
   510 
   583 
       
   584 \subsection{Vandermonde matrix generation}
   511 \begin{frame}[fragile]
   585 \begin{frame}[fragile]
   512 \frametitle{Vandermonde Matrix}
   586 \frametitle{Vandermonde Matrix}
   513 \begin{itemize}
   587 \begin{itemize}
   514 \item A is also called a Vandermonde matrix
   588 \item A is also called a Vandermonde matrix
   515 \item It can be generated using \typ{vander}
   589 \item It can be generated using \typ{vander}
   537 \begin{lstlisting}
   611 \begin{lstlisting}
   538 In []: coef, res, r, s = lstsq(A,TSq)
   612 In []: coef, res, r, s = lstsq(A,TSq)
   539 \end{lstlisting}
   613 \end{lstlisting}
   540 \end{frame}
   614 \end{frame}
   541 
   615 
       
   616 \subsection{Plotting}
   542 \begin{frame}[fragile]
   617 \begin{frame}[fragile]
   543 \frametitle{Least Square Fit Line \ldots}
   618 \frametitle{Least Square Fit Line \ldots}
   544 We get the points of the line from \typ{coef}
   619 We get the points of the line from \typ{coef}
   545 \begin{lstlisting}
   620 \begin{lstlisting}
   546 In []: Tline = coef[0]*L + coef[1]
   621 In []: Tline = coef[0]*L + coef[1]
   551 \begin{lstlisting}
   626 \begin{lstlisting}
   552 In []: plot(L, Tline)
   627 In []: plot(L, Tline)
   553 \end{lstlisting}
   628 \end{lstlisting}
   554 \end{frame}
   629 \end{frame}
   555 
   630 
       
   631 \begin{frame}[fragile]
       
   632   \frametitle{What did we learn?}
       
   633   \begin{itemize}
       
   634    \item Least square fit
       
   635    \item Vandermonde matrix generation
       
   636    \item Plotting the least square fit curve
       
   637   \end{itemize}
       
   638 \end{frame}
       
   639 
   556 \end{document}
   640 \end{document}