71 % postbreak = \space\dots |
71 % postbreak = \space\dots |
72 % } |
72 % } |
73 |
73 |
74 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
74 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
75 % Title page |
75 % Title page |
76 \title[Statistics]{Python for Science and Engg: Statistics} |
76 \title[Statistics]{Python for Science and Engg:\\ Basic data processing} |
77 |
77 |
78 \author[FOSSEE] {FOSSEE} |
78 \author[FOSSEE] {FOSSEE} |
79 |
79 |
80 \institute[IIT Bombay] {Department of Aerospace Engineering\\IIT Bombay} |
80 \institute[IIT Bombay] {Department of Aerospace Engineering\\IIT Bombay} |
81 |
81 |
82 \date[] {30 April, 2010\\Day 1, Session 3} |
82 \date[] {SciPy 2010, Introductory tutorials,\\Day 1, Session 3} |
83 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
83 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
84 |
84 |
85 %\pgfdeclareimage[height=0.75cm]{iitmlogo}{iitmlogo} |
85 %\pgfdeclareimage[height=0.75cm]{iitmlogo}{iitmlogo} |
86 %\logo{\pgfuseimage{iitmlogo}} |
86 %\logo{\pgfuseimage{iitmlogo}} |
87 |
87 |
125 %% \frametitle{Outline} |
125 %% \frametitle{Outline} |
126 %% \tableofcontents |
126 %% \tableofcontents |
127 %% % You might wish to add the option [pausesections] |
127 %% % You might wish to add the option [pausesections] |
128 %% \end{frame} |
128 %% \end{frame} |
129 |
129 |
130 \section{Computing mean} |
130 \section{Computing the mean} |
131 \begin{frame} |
131 \begin{frame} |
132 \frametitle{Value of acceleration due to gravity?} |
132 \frametitle{Value of acceleration due to gravity?} |
133 \begin{itemize} |
133 \begin{itemize} |
134 \item We already have pendulum.txt |
134 \item We already have \typ{pendulum.txt} |
135 \item We know that $ T = 2\pi \sqrt{\frac{L}{g}} $ |
135 \item We know that $ T = 2\pi \sqrt{\frac{L}{g}} $ |
136 \item So $ g = \frac{4 \pi^2 L}{T^2} $ |
136 \item So $ g = \frac{4 \pi^2 L}{T^2} $ |
137 \item Calculate ``g'' - acceleration due to gravity for each pair of L and T |
137 \item Calculate $g$ - acceleration due to gravity for each pair of |
138 \item Hence calculate mean ``g'' |
138 $L$ and $T$ |
139 \end{itemize} |
139 \item Hence calculate mean $g$ |
140 \end{frame} |
140 \end{itemize} |
141 |
141 \end{frame} |
142 \begin{frame}[fragile] |
142 |
143 \frametitle{Acceleration due to gravity - ``g''\ldots} |
143 \begin{frame}[fragile] |
|
144 \frametitle{Acceleration due to gravity - $g$\ldots} |
144 \begin{lstlisting} |
145 \begin{lstlisting} |
145 In []: g_list = [] |
146 In []: g_list = [] |
146 In []: for line in open('pendulum.txt'): |
147 In []: for line in open('pendulum.txt'): |
147 .... point = line.split() |
148 .... point = line.split() |
148 .... L = float(point[0]) |
149 .... L = float(point[0]) |
164 In []: print 'Mean: ', g_mean |
165 In []: print 'Mean: ', g_mean |
165 \end{lstlisting} |
166 \end{lstlisting} |
166 \end{frame} |
167 \end{frame} |
167 |
168 |
168 \begin{frame}[fragile] |
169 \begin{frame}[fragile] |
169 \frametitle{Mean ``g'' - Slightly improved method} |
170 \frametitle{Mean $g$ - Slightly improved method} |
170 \begin{lstlisting} |
171 \begin{lstlisting} |
171 In []: g_mean = sum(g_list) / len(g_list) |
172 In []: g_mean = sum(g_list) / len(g_list) |
172 In []: print 'Mean: ', g_mean |
173 In []: print 'Mean: ', g_mean |
173 \end{lstlisting} |
174 \end{lstlisting} |
174 \end{frame} |
175 \end{frame} |
175 |
176 |
176 \begin{frame}[fragile] |
177 \begin{frame}[fragile] |
177 \frametitle{Mean ``g'' - One liner} |
178 \frametitle{Mean $g$ - One liner} |
178 \begin{lstlisting} |
179 \begin{lstlisting} |
179 In []: g_mean = mean(g_list) |
180 In []: g_mean = mean(g_list) |
180 In []: print 'Mean: ', g_mean |
181 In []: print 'Mean: ', g_mean |
181 \end{lstlisting} |
182 \end{lstlisting} |
182 \inctime{10} |
183 \inctime{10} |
184 |
185 |
185 \section{Processing voluminous data} |
186 \section{Processing voluminous data} |
186 \begin{frame} |
187 \begin{frame} |
187 \frametitle{More on data processing} |
188 \frametitle{More on data processing} |
188 \begin{block}{} |
189 \begin{block}{} |
189 We have a huge data file--180,000 records.\\How do we do \emph{efficient} statistical computations, i.e. find mean, median, standard deviation etc; draw pie charts? |
190 We have a huge data file--180,000 records.\\How do we do |
|
191 \emph{efficient} statistical computations, i.e. find mean, median, |
|
192 standard deviation etc.;\\How do we draw pie charts? |
190 \end{block} |
193 \end{block} |
191 \end{frame} |
194 \end{frame} |
192 |
195 |
193 \begin{frame} |
196 \begin{frame} |
194 \frametitle{Structure of the file} |
197 \frametitle{Structure of the file} |
195 Understanding the structure of sslc1.txt |
198 Understanding the structure of \typ{sslc1.txt} |
196 \begin{itemize} |
199 \begin{itemize} |
197 \item Each line in the file has a student's details (record) |
200 \item Each line in the file has a student's details (record) |
198 \item Each record consists of fields separated by ';' |
201 \item Each record consists of fields separated by ';' |
199 \end{itemize} |
202 \end{itemize} |
200 \emphbar{A;015162;JENIL T P;081;060;77;41;74;333;P;;} |
203 \emphbar{A;015162;JENIL T P;081;060;77;41;74;333;P;;} |
206 Each record consists of: |
209 Each record consists of: |
207 \begin{itemize} |
210 \begin{itemize} |
208 \item Region Code |
211 \item Region Code |
209 \item Roll Number |
212 \item Roll Number |
210 \item Name |
213 \item Name |
211 \item Marks of 5 subjects: SLang, Flang, Maths, Science, Social |
214 \item Marks of 5 subjects: second lang., first lang., Math, Science, |
|
215 Social Studies |
212 \item Total marks |
216 \item Total marks |
213 \item Pass/Fail (P/F) |
217 \item Pass/Fail (P/F) |
214 \item Withheld (W) |
218 \item Withheld (W) |
215 \end{itemize} |
219 \end{itemize} |
216 \inctime{5} |
220 \inctime{5} |
217 \end{frame} |
221 \end{frame} |
218 |
222 |
219 \begin{frame} |
223 \begin{frame} |
220 \frametitle{Statistical Analysis: Problem statement} |
224 \frametitle{Statistical Analysis: Problem statement} |
221 1. Read the data supplied in the file \emph{sslc1.txt} and carry out the following: |
225 1. Read the data supplied in the file \typ{sslc1.txt} and carry out the following: |
222 \begin{itemize} |
226 \begin{itemize} |
223 \item[a] Draw a pie chart representing the proportion of students who scored more than 90\% in each region in Science. |
227 \item[a] Draw a pie chart representing the proportion of students who scored more than 90\% in each region in Science. |
224 \item[b] Print mean, median and standard deviation of math scores for all regions combined. |
228 \item[b] Print mean, median and standard deviation of math scores for all regions combined. |
225 \end{itemize} |
229 \end{itemize} |
226 \end{frame} |
230 \end{frame} |
334 \end{lstlisting} |
338 \end{lstlisting} |
335 \begin{itemize} |
339 \begin{itemize} |
336 \item Keys will be region codes |
340 \item Keys will be region codes |
337 \item Values will be the number of students who scored more than 90\% in that region in Science |
341 \item Values will be the number of students who scored more than 90\% in that region in Science |
338 \end{itemize} |
342 \end{itemize} |
339 \begin{block}{Sample \emph{science} dictionary} |
343 \begin{block}{Sample \typ{science} dictionary} |
340 \{'A': 729, 'C': 764, 'B': 1120,'E': 414, 'D': 603, 'F': 500\} |
344 \{'A': 729, 'C': 764, 'B': 1120,'E': 414, 'D': 603, 'F': 500\} |
341 \end{block} |
345 \end{block} |
342 |
346 |
343 \end{frame} |
347 \end{frame} |
344 |
348 |
456 \end{lstlisting} |
460 \end{lstlisting} |
457 \inctime{5} |
461 \inctime{5} |
458 \end{frame} |
462 \end{frame} |
459 |
463 |
460 \begin{frame}[fragile] |
464 \begin{frame}[fragile] |
|
465 \frametitle{IPython tip: Timing} |
|
466 |
|
467 Try the following: |
|
468 \begin{lstlisting} |
|
469 In []: %timeit mean(math_scores) |
|
470 |
|
471 In []: %timeit mean(math_array) |
|
472 |
|
473 In []: %timeit? |
|
474 |
|
475 \end{lstlisting} |
|
476 |
|
477 \begin{itemize} |
|
478 \item \typ{\%timeit}: accurate, many measurements |
|
479 \item Can also use \typ{\%time} |
|
480 \item \typ{\%time}: less accurate, one measurement |
|
481 \end{itemize} |
|
482 |
|
483 \inctime{5} |
|
484 \end{frame} |
|
485 |
|
486 \begin{frame}[fragile] |
461 \frametitle{What tools did we use?} |
487 \frametitle{What tools did we use?} |
462 \begin{itemize} |
488 \begin{itemize} |
|
489 \item More on parsing data |
463 \item Dictionaries for storing data |
490 \item Dictionaries for storing data |
464 \item Facilities for drawing pie charts |
491 \item Facilities for drawing pie charts |
|
492 \item Functions for statistical computations - mean, median, standard deviation |
465 \item Efficient array manipulations |
493 \item Efficient array manipulations |
466 \item Functions for statistical computations - mean, median, standard deviation |
494 \item Timing in IPython |
467 \end{itemize} |
495 \end{itemize} |
|
496 |
468 \end{frame} |
497 \end{frame} |
469 |
498 |
470 \end{document} |
499 \end{document} |
471 |
500 |
472 %% Questions for Quiz %% |
501 %% Questions for Quiz %% |