124 %% \frametitle{Outline} |
124 %% \frametitle{Outline} |
125 %% \tableofcontents |
125 %% \tableofcontents |
126 %% % You might wish to add the option [pausesections] |
126 %% % You might wish to add the option [pausesections] |
127 %% \end{frame} |
127 %% \end{frame} |
128 |
128 |
129 \section{Statistics} |
129 \section{Processing voluminous data} |
130 \begin{frame} |
130 \begin{frame} |
131 \frametitle{More on data processing} |
131 \frametitle{More on data processing} |
132 \begin{block}{} |
132 \begin{block}{} |
133 We have a huge--1m records--data file.\\How do we do \emph{efficient} statistical computations, that is find mean, median, mode, standard deviation etc; draw pie charts? |
133 We have a huge--1m records--data file.\\How do we do \emph{efficient} statistical computations, that is find mean, median, mode, standard deviation etc; draw pie charts? |
134 \end{block} |
134 \end{block} |
135 \end{frame} |
135 \end{frame} |
136 |
136 |
137 |
137 |
138 \begin{frame} |
138 \begin{frame} |
139 \frametitle{Statistical Analysis and Parsing} |
139 \frametitle{Statistical Analysis: Problem statement} |
140 Read the data supplied in \emph{sslc1.txt} and obtain the following statistics: |
140 Read the data supplied in \emph{sslc1.txt} and carry out the following: |
141 \begin{itemize} |
141 \begin{enumerate} |
142 \item Draw a pie chart representing the number of students who scored more than 90\% in Science per region. |
142 \item Draw a pie chart representing the proportion of students who scored more than 90\% in each region in Science. |
143 \item Draw a pie chart representing the number of students who scored more than 90\% per subject(All regions combined). |
143 \item Draw a pie chart representing the proportion of students who scored more than 90\% in each subject across regions. |
144 \item Print mean, median, mode and standard deviation of math scores for all regions combined. |
144 \item Print mean, median, mode and standard deviation of math scores for all regions combined. |
145 \end{itemize} |
145 \end{enumerate} |
|
146 \end{frame} |
|
147 |
|
148 \begin{frame} |
|
149 \frametitle{Problem statement: explanation} |
|
150 \emphbar{Draw a pie chart representing the proportion of students who scored more than 90\% in each region in Science.} |
|
151 \begin{enumerate} |
|
152 \item Complete (100\%) data - Number of students who scored more than 90\% in Science |
|
153 \item Each slice - Number of students who scored more than 90\% in Science in one region |
|
154 \end{enumerate} |
|
155 \end{frame} |
|
156 |
|
157 \begin{frame} |
|
158 \frametitle{Problem statement: explanation} |
|
159 \emphbar{Draw a pie chart representing the proportion of students who scored more than 90\% in each subject across regions.} |
|
160 \begin{enumerate} |
|
161 \item Complete (100\%) data - Number of students who scored more than 90\% across all regions |
|
162 \item Each slice - Number of students who scored more than 90\% in each subject across all regions |
|
163 \end{enumerate} |
146 \end{frame} |
164 \end{frame} |
147 |
165 |
148 \begin{frame} |
166 \begin{frame} |
149 \frametitle{Statistical Analysis and Parsing \ldots} |
167 \frametitle{Statistical Analysis and Parsing \ldots} |
150 Machinery Required - |
168 Machinery Required - |
159 |
177 |
160 \begin{frame} |
178 \begin{frame} |
161 \frametitle{File reading and parsing} |
179 \frametitle{File reading and parsing} |
162 Understanding the structure of sslc1.txt |
180 Understanding the structure of sslc1.txt |
163 \begin{itemize} |
181 \begin{itemize} |
164 \item Each line in the file corresponds to one student's details |
182 \item One line in file corresponds to a student's details |
165 \item aka record |
183 \item aka record |
166 \item Each record consists of several fields separated by a ';' |
184 \item Each record consists of fields separated by ';' |
167 \end{itemize} |
185 \end{itemize} |
168 \end{frame} |
186 \end{frame} |
169 |
187 |
170 \begin{frame} |
188 \begin{frame} |
171 \frametitle{File reading and parsing \ldots} |
189 \frametitle{File reading and parsing \ldots} |
233 Out[6]: ["I'll be back", 42] |
251 Out[6]: ["I'll be back", 42] |
234 \end{lstlisting} |
252 \end{lstlisting} |
235 \end{frame} |
253 \end{frame} |
236 |
254 |
237 \begin{frame}[fragile] |
255 \begin{frame}[fragile] |
|
256 \frametitle{Back to lists: Iterating} |
|
257 \begin{itemize} |
|
258 \item Python's \kwrd{for} loop iterates through list items |
|
259 \item In other languages (C/C++) we run through indices and pick items from the array using these indices |
|
260 \item In Python, while iterating through list items, the current position is not available |
|
261 \end{itemize} |
|
262 \begin{block}{Iterating through indices} |
|
263 What if we want the index of an item of a list? |
|
264 \end{block} |
|
265 |
|
266 \end{frame} |
|
267 |
|
268 \begin{frame}[fragile] |
238 \frametitle{enumerate: Iterating through list indices} |
269 \frametitle{enumerate: Iterating through list indices} |
239 \begin{lstlisting} |
270 \begin{lstlisting} |
240 In [1]: names = ["Guido","Alex", "Tim"] |
271 In [1]: names = ["Guido","Alex", "Tim"] |
241 |
272 |
242 In [2]: for i, name in enumerate(names): |
273 In [2]: for i, name in enumerate(names): |
386 \begin{frame}[fragile] |
417 \begin{frame}[fragile] |
387 \frametitle{Pie charts} |
418 \frametitle{Pie charts} |
388 \includegraphics[height=3in, interpolate=true]{data/all_regions} |
419 \includegraphics[height=3in, interpolate=true]{data/all_regions} |
389 \end{frame} |
420 \end{frame} |
390 |
421 |
391 \subsection{Obtaining stastics} |
422 \subsection{Obtaining statistics} |
|
423 \begin{frame}[fragile] |
|
424 \frametitle{Obtaining statistics} |
|
425 \begin{block}{Statistics: Mean} |
|
426 Obtain the mean of Math scores |
|
427 \end{block} |
|
428 \end{frame} |
|
429 |
|
430 \begin{frame}[fragile] |
|
431 \frametitle{Obtaining statistics: Solution} |
|
432 \begin{block}{Statistics: Mean} |
|
433 Obtain the mean of Math scores |
|
434 \end{block} |
|
435 \begin{lstlisting} |
|
436 math_scores = scores[2] |
|
437 total = 0 |
|
438 for i, score in enumerate(math_scores): |
|
439 total += score |
|
440 |
|
441 mean = total / (i + 1) |
|
442 print "Mean: ", mean |
|
443 \end{lstlisting} |
|
444 \end{frame} |
|
445 |
|
446 \begin{frame}[fragile] |
|
447 \frametitle{Obtaining statistics: Another solution} |
|
448 \begin{block}{Statistics: Mean} |
|
449 Obtain the mean of Math scores |
|
450 \end{block} |
|
451 \begin{lstlisting} |
|
452 math_scores = scores[2] |
|
453 mean = sum(math_scores) / |
|
454 len(math_scores) |
|
455 \end{lstlisting} |
|
456 \end{frame} |
|
457 |
|
458 \begin{frame}[fragile] |
|
459 \frametitle{NumPy arrays} |
|
460 \begin{itemize} |
|
461 \item NumPy provides arrays |
|
462 \item arrays are very efficient and powerful |
|
463 \item Very easy to perform element-wise operations - \typ{+, -, *, /, \%} |
|
464 \begin{lstlisting} |
|
465 In [1]: a = array([1, 2, 3]) |
|
466 In [2]: b = array([4, 5, 6]) |
|
467 |
|
468 In [3]: a + b |
|
469 Out[3]: array([5, 7, 9]) |
|
470 \end{lstlisting} |
|
471 \item Very easy to compute statistics |
|
472 \end{itemize} |
|
473 \end{frame} |
|
474 |
392 \begin{frame}[fragile] |
475 \begin{frame}[fragile] |
393 \frametitle{Obtaining statistics} |
476 \frametitle{Obtaining statistics} |
394 \begin{lstlisting} |
477 \begin{lstlisting} |
395 math_scores = array(scores[2]) |
478 math_scores = array(scores[2]) |
396 |
479 |
414 \item NumPy arrays for efficient array manipulations |
497 \item NumPy arrays for efficient array manipulations |
415 \item Functions for statistical computations - mean, median, mode, standard deviation |
498 \item Functions for statistical computations - mean, median, mode, standard deviation |
416 \end{itemize} |
499 \end{itemize} |
417 \end{frame} |
500 \end{frame} |
418 |
501 |
|
502 \section{Least square fit} |
419 \begin{frame} |
503 \begin{frame} |
420 \frametitle{L vs $T^2$ \ldots} |
504 \frametitle{L vs $T^2$ \ldots} |
421 Let's go back to the L vs $T^2$ plot |
505 Let's go back to the L vs $T^2$ plot |
422 \begin{itemize} |
506 \begin{itemize} |
423 \item We first look at obtaining $T^2$ from T |
507 \item We first look at obtaining $T^2$ from T |
439 \begin{lstlisting} |
523 \begin{lstlisting} |
440 In []: L = array(L) |
524 In []: L = array(L) |
441 In []: T = array(T) |
525 In []: T = array(T) |
442 In []: TSq = T*T |
526 In []: TSq = T*T |
443 \end{lstlisting} |
527 \end{lstlisting} |
444 \end{frame} |
|
445 |
|
446 \begin{frame}[fragile] |
|
447 \frametitle{Arrays} |
|
448 \begin{itemize} |
|
449 \item \typ{T} and \typ{L} are now arrays |
|
450 \item arrays are very efficient and powerful |
|
451 \item Very easy to perform element-wise operations |
|
452 \item \typ{+, -, *, /, \%} |
|
453 \item More about arrays later |
|
454 \end{itemize} |
|
455 \end{frame} |
528 \end{frame} |
456 |
529 |
457 \begin{frame}[fragile] |
530 \begin{frame}[fragile] |
458 \frametitle{Least Squares Fit} |
531 \frametitle{Least Squares Fit} |
459 \vspace{-0.15in} |
532 \vspace{-0.15in} |
506 \end{bmatrix}$ |
579 \end{bmatrix}$ |
507 \item We need to find $p$ to plot the line |
580 \item We need to find $p$ to plot the line |
508 \end{itemize} |
581 \end{itemize} |
509 \end{frame} |
582 \end{frame} |
510 |
583 |
|
584 \subsection{Vandermonde matrix generation} |
511 \begin{frame}[fragile] |
585 \begin{frame}[fragile] |
512 \frametitle{Vandermonde Matrix} |
586 \frametitle{Vandermonde Matrix} |
513 \begin{itemize} |
587 \begin{itemize} |
514 \item A is also called a Vandermonde matrix |
588 \item A is also called a Vandermonde matrix |
515 \item It can be generated using \typ{vander} |
589 \item It can be generated using \typ{vander} |
537 \begin{lstlisting} |
611 \begin{lstlisting} |
538 In []: coef, res, r, s = lstsq(A,TSq) |
612 In []: coef, res, r, s = lstsq(A,TSq) |
539 \end{lstlisting} |
613 \end{lstlisting} |
540 \end{frame} |
614 \end{frame} |
541 |
615 |
|
616 \subsection{Plotting} |
542 \begin{frame}[fragile] |
617 \begin{frame}[fragile] |
543 \frametitle{Least Square Fit Line \ldots} |
618 \frametitle{Least Square Fit Line \ldots} |
544 We get the points of the line from \typ{coef} |
619 We get the points of the line from \typ{coef} |
545 \begin{lstlisting} |
620 \begin{lstlisting} |
546 In []: Tline = coef[0]*L + coef[1] |
621 In []: Tline = coef[0]*L + coef[1] |