1 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
|
2 %Tutorial slides on Python. |
|
3 % |
|
4 % Author: FOSSEE |
|
5 % Copyright (c) 2009, FOSSEE, IIT Bombay |
|
6 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
|
7 |
|
8 \documentclass[14pt,compress]{beamer} |
|
9 %\documentclass[draft]{beamer} |
|
10 %\documentclass[compress,handout]{beamer} |
|
11 %\usepackage{pgfpages} |
|
12 %\pgfpagesuselayout{2 on 1}[a4paper,border shrink=5mm] |
|
13 |
|
14 % Modified from: generic-ornate-15min-45min.de.tex |
|
15 \mode<presentation> |
|
16 { |
|
17 \usetheme{Warsaw} |
|
18 \useoutertheme{infolines} |
|
19 \setbeamercovered{transparent} |
|
20 } |
|
21 |
|
22 \usepackage[english]{babel} |
|
23 \usepackage[latin1]{inputenc} |
|
24 %\usepackage{times} |
|
25 \usepackage[T1]{fontenc} |
|
26 |
|
27 % Taken from Fernando's slides. |
|
28 \usepackage{ae,aecompl} |
|
29 \usepackage{mathpazo,courier,euler} |
|
30 \usepackage[scaled=.95]{helvet} |
|
31 \usepackage{amsmath} |
|
32 |
|
33 \definecolor{darkgreen}{rgb}{0,0.5,0} |
|
34 |
|
35 \usepackage{listings} |
|
36 \lstset{language=Python, |
|
37 basicstyle=\ttfamily\bfseries, |
|
38 commentstyle=\color{red}\itshape, |
|
39 stringstyle=\color{darkgreen}, |
|
40 showstringspaces=false, |
|
41 keywordstyle=\color{blue}\bfseries} |
|
42 |
|
43 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
|
44 % Macros |
|
45 \setbeamercolor{emphbar}{bg=blue!20, fg=black} |
|
46 \newcommand{\emphbar}[1] |
|
47 {\begin{beamercolorbox}[rounded=true]{emphbar} |
|
48 {#1} |
|
49 \end{beamercolorbox} |
|
50 } |
|
51 \newcounter{time} |
|
52 \setcounter{time}{0} |
|
53 \newcommand{\inctime}[1]{\addtocounter{time}{#1}{\tiny \thetime\ m}} |
|
54 |
|
55 \newcommand{\typ}[1]{\lstinline{#1}} |
|
56 |
|
57 \newcommand{\kwrd}[1]{ \texttt{\textbf{\color{blue}{#1}}} } |
|
58 |
|
59 %%% This is from Fernando's setup. |
|
60 % \usepackage{color} |
|
61 % \definecolor{orange}{cmyk}{0,0.4,0.8,0.2} |
|
62 % % Use and configure listings package for nicely formatted code |
|
63 % \usepackage{listings} |
|
64 % \lstset{ |
|
65 % language=Python, |
|
66 % basicstyle=\small\ttfamily, |
|
67 % commentstyle=\ttfamily\color{blue}, |
|
68 % stringstyle=\ttfamily\color{orange}, |
|
69 % showstringspaces=false, |
|
70 % breaklines=true, |
|
71 % postbreak = \space\dots |
|
72 % } |
|
73 |
|
74 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
|
75 % Title page |
|
76 \title[Statistics]{Python for Science and Engg:\\ Basic data processing} |
|
77 |
|
78 \author[FOSSEE] {FOSSEE} |
|
79 |
|
80 \institute[IIT Bombay] {Department of Aerospace Engineering\\IIT Bombay} |
|
81 |
|
82 \date[] {SciPy 2010, Introductory tutorials,\\Day 1, Session 3} |
|
83 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
|
84 |
|
85 %\pgfdeclareimage[height=0.75cm]{iitmlogo}{iitmlogo} |
|
86 %\logo{\pgfuseimage{iitmlogo}} |
|
87 |
|
88 |
|
89 %% Delete this, if you do not want the table of contents to pop up at |
|
90 %% the beginning of each subsection: |
|
91 \AtBeginSubsection[] |
|
92 { |
|
93 \begin{frame}<beamer> |
|
94 \frametitle{Outline} |
|
95 \tableofcontents[currentsection,currentsubsection] |
|
96 \end{frame} |
|
97 } |
|
98 |
|
99 \AtBeginSection[] |
|
100 { |
|
101 \begin{frame}<beamer> |
|
102 \frametitle{Outline} |
|
103 \tableofcontents[currentsection,currentsubsection] |
|
104 \end{frame} |
|
105 } |
|
106 |
|
107 \newcommand{\num}{\texttt{numpy}} |
|
108 |
|
109 |
|
110 % If you wish to uncover everything in a step-wise fashion, uncomment |
|
111 % the following command: |
|
112 %\beamerdefaultoverlayspecification{<+->} |
|
113 |
|
114 %\includeonlyframes{current,current1,current2,current3,current4,current5,current6} |
|
115 |
|
116 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
|
117 % DOCUMENT STARTS |
|
118 \begin{document} |
|
119 |
|
120 \begin{frame} |
|
121 \maketitle |
|
122 \end{frame} |
|
123 |
|
124 %% \begin{frame} |
|
125 %% \frametitle{Outline} |
|
126 %% \tableofcontents |
|
127 %% % You might wish to add the option [pausesections] |
|
128 %% \end{frame} |
|
129 |
|
130 \section{Computing the mean} |
|
131 \begin{frame} |
|
132 \frametitle{Value of acceleration due to gravity?} |
|
133 \begin{itemize} |
|
134 \item We already have \typ{pendulum.txt} |
|
135 \item We know that $ T = 2\pi \sqrt{\frac{L}{g}} $ |
|
136 \item So $ g = \frac{4 \pi^2 L}{T^2} $ |
|
137 \item Calculate $g$ - acceleration due to gravity for each pair of |
|
138 $L$ and $T$ |
|
139 \item Hence calculate mean $g$ |
|
140 \end{itemize} |
|
141 \end{frame} |
|
142 |
|
143 \begin{frame}[fragile] |
|
144 \frametitle{Acceleration due to gravity - $g$\ldots} |
|
145 \begin{lstlisting} |
|
146 In []: g_list = [] |
|
147 In []: for line in open('pendulum.txt'): |
|
148 .... point = line.split() |
|
149 .... L = float(point[0]) |
|
150 .... t = float(point[1]) |
|
151 .... g = 4 * pi * pi * L / (t * t) |
|
152 .... g_list.append(g) |
|
153 \end{lstlisting} |
|
154 \end{frame} |
|
155 |
|
156 \begin{frame}[fragile] |
|
157 \frametitle{Mean $g$ - Classical method} |
|
158 \begin{lstlisting} |
|
159 In []: total = 0 |
|
160 In []: for g in g_list: |
|
161 ....: total += g |
|
162 ....: |
|
163 |
|
164 In []: g_mean = total / len(g_list) |
|
165 In []: print 'Mean: ', g_mean |
|
166 \end{lstlisting} |
|
167 \end{frame} |
|
168 |
|
169 \begin{frame}[fragile] |
|
170 \frametitle{Mean $g$ - Slightly improved method} |
|
171 \begin{lstlisting} |
|
172 In []: g_mean = sum(g_list) / len(g_list) |
|
173 In []: print 'Mean: ', g_mean |
|
174 \end{lstlisting} |
|
175 \end{frame} |
|
176 |
|
177 \begin{frame}[fragile] |
|
178 \frametitle{Mean $g$ - One liner} |
|
179 \begin{lstlisting} |
|
180 In []: g_mean = mean(g_list) |
|
181 In []: print 'Mean: ', g_mean |
|
182 \end{lstlisting} |
|
183 \inctime{10} |
|
184 \end{frame} |
|
185 |
|
186 \section{Processing voluminous data} |
|
187 \begin{frame} |
|
188 \frametitle{More on data processing} |
|
189 \begin{block}{} |
|
190 We have a huge data file--180,000 records.\\How do we do |
|
191 \emph{efficient} statistical computations, i.e. find mean, median, |
|
192 standard deviation etc.?\\How do we draw pie charts? |
|
193 \end{block} |
|
194 \end{frame} |
|
195 |
|
196 \begin{frame} |
|
197 \frametitle{Structure of the file} |
|
198 Understanding the structure of \typ{sslc1.txt} |
|
199 \begin{itemize} |
|
200 \item Each line in the file has a student's details (record) |
|
201 \item Each record consists of fields separated by ';' |
|
202 \end{itemize} |
|
203 \emphbar{A;015162;JENIL T P;081;060;77;41;74;333;P;;} |
|
204 \end{frame} |
|
205 |
|
206 \begin{frame} |
|
207 \frametitle{Structure of the file \ldots} |
|
208 \emphbar{A;015163;JOSEPH RAJ S;083;042;47;AA;72;244;;;} |
|
209 Each record consists of: |
|
210 \begin{itemize} |
|
211 \item Region Code |
|
212 \item Roll Number |
|
213 \item Name |
|
214 \item Marks of 5 subjects: second lang., first lang., Math, Science, |
|
215 Social Studies |
|
216 \item Total marks |
|
217 \item Pass/Fail (P/F) |
|
218 \item Withheld (W) |
|
219 \end{itemize} |
|
220 \inctime{5} |
|
221 \end{frame} |
|
222 |
|
223 \begin{frame} |
|
224 \frametitle{Statistical Analysis: Problem statement} |
|
225 1. Read the data supplied in the file \typ{sslc1.txt} and carry out the following: |
|
226 \begin{itemize} |
|
227 \item[a] Draw a pie chart representing proportion of students who scored more than 90\% in each region in Science. |
|
228 \item[b] Print mean, median and standard deviation of math scores for all regions combined. |
|
229 \end{itemize} |
|
230 \end{frame} |
|
231 |
|
232 \begin{frame} |
|
233 \frametitle{Problem statement: explanation} |
|
234 \emphbar{a. Draw a pie chart representing proportion of students who scored more than 90\% in each region in Science.} |
|
235 \begin{columns} |
|
236 \column{5.25\textwidth} |
|
237 \hspace*{.5in} |
|
238 \includegraphics[height=2.6in, interpolate=true]{data/science} |
|
239 \column{0.8\textwidth} |
|
240 \end{columns} |
|
241 \end{frame} |
|
242 |
|
243 \begin{frame} |
|
244 \frametitle{Machinery Required} |
|
245 \begin{itemize} |
|
246 \item File reading |
|
247 \item Parsing |
|
248 \item Dictionaries |
|
249 \item Arrays |
|
250 \item Statistical operations |
|
251 \end{itemize} |
|
252 \end{frame} |
|
253 |
|
254 \subsection{Data processing} |
|
255 \begin{frame}[fragile] |
|
256 \frametitle{File reading and parsing \ldots} |
|
257 \emphbar{Reading files line by line is the same as what we did in the pendulum example.} |
|
258 |
|
259 \begin{lstlisting} |
|
260 for record in open('sslc1.txt'): |
|
261 fields = record.split(';') |
|
262 \end{lstlisting} |
|
263 \end{frame} |
|
264 |
|
265 \subsection{Dictionaries} |
|
266 \begin{frame}[fragile] |
|
267 \frametitle{Dictionaries: Introduction} |
|
268 \begin{itemize} |
|
269 \item Lists index using integers\\ |
|
270 Recall \typ{p = [2, 3, 5, 7]} and\\ |
|
271 \typ{p[1]} is equal to \typ{3} |
|
272 \item Dictionaries index using strings |
|
273 \end{itemize} |
|
274 \end{frame} |
|
275 |
|
276 \begin{frame}[fragile] |
|
277 \frametitle{Dictionaries \ldots} |
|
278 \begin{lstlisting} |
|
279 In []: d = {'png' : 'image file', |
|
280 'txt' : 'text file', |
|
281 'py' : 'python code', |
|
282 'java': 'bad code', |
|
283 'cpp': 'complex code'} |
|
284 |
|
285 In []: d['txt'] |
|
286 Out[]: 'text file' |
|
287 \end{lstlisting} |
|
288 \end{frame} |
|
289 |
|
290 \begin{frame}[fragile] |
|
291 \frametitle{Dictionaries \ldots} |
|
292 \begin{lstlisting} |
|
293 In []: 'py' in d |
|
294 Out[]: True |
|
295 |
|
296 In []: 'jpg' in d |
|
297 Out[]: False |
|
298 \end{lstlisting} |
|
299 \end{frame} |
|
300 |
|
301 \begin{frame}[fragile] |
|
302 \frametitle{Dictionaries \ldots} |
|
303 \begin{small} |
|
304 \begin{lstlisting} |
|
305 In []: d.keys() |
|
306 Out[]: ['cpp', 'py', 'txt', 'java', 'png'] |
|
307 |
|
308 In []: d.values() |
|
309 Out[]: ['complex code', 'python code', |
|
310 'text file', 'bad code', |
|
311 'image file'] |
|
312 \end{lstlisting} |
|
313 \end{small} |
|
314 \inctime{10} |
|
315 \end{frame} |
|
316 |
|
317 \begin{frame}[fragile] |
|
318 \frametitle{Inserting elements into dictionary} |
|
319 \emphbar{\alert{d[key] = value}} |
|
320 \begin{lstlisting} |
|
321 In []: d['bin'] = 'binary file' |
|
322 In []: d |
|
323 Out[]: |
|
324 {'bin': 'binary file', |
|
325 'cpp': 'complex code', |
|
326 'java': 'bad code', |
|
327 'png': 'image file', |
|
328 'py': 'python code', |
|
329 'txt': 'text file'} |
|
330 \end{lstlisting} |
|
331 \end{frame} |
|
332 |
|
333 \begin{frame}[fragile] |
|
334 \frametitle{Getting back to the problem} |
|
335 Let our dictionary be: |
|
336 \begin{lstlisting} |
|
337 science = {} |
|
338 \end{lstlisting} |
|
339 \begin{itemize} |
|
340 \item Keys will be region codes |
|
341 \item Values will be the number of students who scored more than 90\% in that region in Science |
|
342 \end{itemize} |
|
343 \begin{block}{Sample \typ{science} dictionary} |
|
344 \{'A': 729, 'C': 764, 'B': 1120,'E': 414, 'D': 603, 'F': 500\} |
|
345 \end{block} |
|
346 |
|
347 \end{frame} |
|
348 |
|
349 \begin{frame}[fragile] |
|
350 \frametitle{Building parsed data \ldots} |
|
351 \begin{lstlisting} |
|
352 science = {} |
|
353 |
|
354 for record in open('sslc1.txt'): |
|
355 fields = record.split(';') |
|
356 |
|
357 region_code = fields[0].strip() |
|
358 \end{lstlisting} |
|
359 \end{frame} |
|
360 |
|
361 \begin{frame}[fragile] |
|
362 \frametitle{Building parsed data \ldots} |
|
363 \begin{lstlisting} |
|
364 if region_code not in science: |
|
365 science[region_code] = 0 |
|
366 |
|
367 score_str = fields[6].strip() |
|
368 |
|
369 score = 0 |
|
370 if score_str != 'AA': |
|
371 score = int(score_str) |
|
372 |
|
373 if score > 90: |
|
374 science[region_code] += 1 |
|
375 \end{lstlisting} |
|
376 \end{frame} |
|
377 |
|
378 \begin{frame}[fragile] |
|
379 \frametitle{Building parsed data \ldots} |
|
380 \begin{lstlisting} |
|
381 print science |
|
382 print science.keys() |
|
383 print science.values() |
|
384 \end{lstlisting} |
|
385 \end{frame} |
|
386 |
|
387 \subsection{Visualizing data} |
|
388 \begin{frame}[fragile] |
|
389 \frametitle{Pie Chart} |
|
390 \begin{lstlisting} |
|
391 pie(science.values()) |
|
392 \end{lstlisting} |
|
393 \includegraphics[height=2in, interpolate=true]{data/science_nolabel} |
|
394 \end{frame} |
|
395 |
|
396 \begin{frame}[fragile] |
|
397 \frametitle{Pie chart} |
|
398 \small |
|
399 \begin{lstlisting} |
|
400 pie(science.values(), |
|
401 labels = science.keys()) |
|
402 title('Students scoring 90% and above |
|
403 in science by region') |
|
404 savefig('science.png') |
|
405 \end{lstlisting} |
|
406 \begin{columns} |
|
407 \column{5.25\textwidth} |
|
408 \hspace*{1.1in} |
|
409 \includegraphics[height=2in, interpolate=true]{data/science} |
|
410 \column{0.8\textwidth} |
|
411 \end{columns} |
|
412 \inctime{10} |
|
413 \end{frame} |
|
414 |
|
415 \begin{frame} |
|
416 \frametitle{Problem statement} |
|
417 \emphbar{b. Print mean, median and standard deviation of math scores for all regions combined.} |
|
418 \end{frame} |
|
419 |
|
420 \begin{frame}[fragile] |
|
421 \frametitle{Building data for statistics} |
|
422 \begin{lstlisting} |
|
423 math_scores = [] |
|
424 |
|
425 for record in open('sslc1.txt'): |
|
426 fields = record.split(';') |
|
427 |
|
428 score_str = fields[5].strip() |
|
429 score = 0 |
|
430 if score_str != 'AA': |
|
431 score = int(score_str) |
|
432 |
|
433 math_scores.append(score) |
|
434 \end{lstlisting} |
|
435 \end{frame} |
|
436 |
|
437 \subsection{Obtaining statistics} |
|
438 \begin{frame}[fragile] |
|
439 \frametitle{Obtaining statistics} |
|
440 \begin{lstlisting} |
|
441 print 'Mean: ', mean(math_scores) |
|
442 |
|
443 print 'Median: ', median(math_scores) |
|
444 |
|
445 print 'Standard Deviation: ', |
|
446 std(math_scores) |
|
447 \end{lstlisting} |
|
448 \inctime{10} |
|
449 \end{frame} |
|
450 |
|
451 \begin{frame}[fragile] |
|
452 \frametitle{Obtaining statistics: efficiently!} |
|
453 \begin{lstlisting} |
|
454 math_array = array(math_scores) |
|
455 |
|
456 print 'Mean: ', mean(math_array) |
|
457 |
|
458 print 'Median: ', median(math_array) |
|
459 |
|
460 print 'Standard Deviation: ', |
|
461 std(math_array) |
|
462 \end{lstlisting} |
|
463 \inctime{5} |
|
464 \end{frame} |
|
465 |
|
466 \begin{frame}[fragile] |
|
467 \frametitle{IPython tip: Timing} |
|
468 |
|
469 Try the following: |
|
470 \begin{lstlisting} |
|
471 In []: %timeit mean(math_scores) |
|
472 |
|
473 In []: %timeit mean(math_array) |
|
474 |
|
475 In []: %timeit? |
|
476 |
|
477 \end{lstlisting} |
|
478 |
|
479 \begin{itemize} |
|
480 \item \typ{\%timeit}: accurate, many measurements |
|
481 \item Can also use \typ{\%time} |
|
482 \item \typ{\%time}: less accurate, one measurement |
|
483 \end{itemize} |
|
484 |
|
485 \inctime{5} |
|
486 \end{frame} |
|
487 |
|
488 \begin{frame}[fragile] |
|
489 \frametitle{What tools did we use?} |
|
490 \begin{itemize} |
|
491 \item More parsing data |
|
492 \item Dictionaries for storing data |
|
493 \item Facilities for drawing pie charts |
|
494 \item Functions for statistical computations - mean, median, standard deviation |
|
495 \item Efficient array manipulations |
|
496 \item Timing in IPython |
|
497 \end{itemize} |
|
498 |
|
499 \end{frame} |
|
500 |
|
501 \end{document} |
|
502 |
|
503 %% Questions for Quiz %% |
|
504 %% ------------------ %% |
|
505 |
|
506 \begin{frame} |
|
507 \frametitle{\incqno } |
|
508 A sample line from a Comma Separated Values (CSV) file:\\ |
|
509 \vspace*{0.2in} |
|
510 \emph{Rossum, Guido, 42, 56, 34, 54}\\ |
|
511 \vspace*{0.2in} |
|
512 What code would you use to separate the line into fields? |
|
513 \end{frame} |
|
514 |
|
515 \begin{frame}[fragile] |
|
516 \frametitle{\incqno } |
|
517 \begin{lstlisting} |
|
518 In []: a = [1, 2, 5, 9] |
|
519 \end{lstlisting} |
|
520 How do you find the length of this list? |
|
521 \end{frame} |
|
522 |
|
523 \begin{frame}[fragile] |
|
524 \frametitle{\incqno } |
|
525 \begin{lstlisting} |
|
526 In [1]: d = { |
|
527 'a': 1, |
|
528 'b': 2 |
|
529 } |
|
530 In [2]: print d['c'] |
|
531 \end{lstlisting} |
|
532 What is the output? |
|
533 \end{frame} |
|
534 |
|
535 \begin{frame}[fragile] |
|
536 \frametitle{\incqno } |
|
537 \begin{lstlisting} |
|
538 In []: sc = {'A': 10, 'B': 20, |
|
539 'C': 70} |
|
540 \end{lstlisting} |
|
541 Given the above dictionary, what command will you give to plot a |
|
542 pie-chart? |
|
543 \end{frame} |
|
544 |
|
545 \begin{frame}[fragile] |
|
546 \frametitle{\incqno } |
|
547 \begin{lstlisting} |
|
548 In []: marks = [10, 20, 30, 50, 55, |
|
549 75, 83] |
|
550 \end{lstlisting} |
|
551 Given the above marks, how will you calculate the \alert{mean} and |
|
552 \alert{standard deviation}? |
|
553 \end{frame} |
|
554 |
|
555 \begin{frame}[fragile] |
|
556 \frametitle{\incqno } |
|
557 \begin{lstlisting} |
|
558 In []: marks = [10, 20, 30, 50, 55, |
|
559 75, 83] |
|
560 \end{lstlisting} |
|
561 How will you convert the list \texttt{marks} to an \alert{array}? |
|
562 \end{frame} |
|
563 |
|
564 %% \begin{frame}[fragile] |
|
565 %% \frametitle{\incqno } |
|
566 %% \begin{lstlisting} |
|
567 %% for x in "abcd": |
|
568 %% print x |
|
569 |
|
570 %% a |
|
571 %% b |
|
572 %% c |
|
573 %% d |
|
574 %% \end{lstlisting} |
|
575 %% How do you get the following output? |
|
576 %% \begin{lstlisting} |
|
577 %% 0 a |
|
578 %% 1 b |
|
579 %% 2 c |
|
580 %% 3 d |
|
581 %% \end{lstlisting} |
|
582 %% \end{frame} |
|
583 |
|