day1/session3.tex
branchscipyin2010
changeset 442 7c5431fa2d46
parent 441 9d9e4026238f
child 443 ca37cf69cd18
equal deleted inserted replaced
441:9d9e4026238f 442:7c5431fa2d46
     1 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
       
     2 %Tutorial slides on Python.
       
     3 %
       
     4 % Author: FOSSEE
       
     5 % Copyright (c) 2009, FOSSEE, IIT Bombay
       
     6 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
       
     7 
       
     8 \documentclass[14pt,compress]{beamer}
       
     9 %\documentclass[draft]{beamer}
       
    10 %\documentclass[compress,handout]{beamer}
       
    11 %\usepackage{pgfpages} 
       
    12 %\pgfpagesuselayout{2 on 1}[a4paper,border shrink=5mm]
       
    13 
       
    14 % Modified from: generic-ornate-15min-45min.de.tex
       
    15 \mode<presentation>
       
    16 {
       
    17   \usetheme{Warsaw}
       
    18   \useoutertheme{infolines}
       
    19   \setbeamercovered{transparent}
       
    20 }
       
    21 
       
    22 \usepackage[english]{babel}
       
    23 \usepackage[latin1]{inputenc}
       
    24 %\usepackage{times}
       
    25 \usepackage[T1]{fontenc}
       
    26 
       
    27 % Taken from Fernando's slides.
       
    28 \usepackage{ae,aecompl}
       
    29 \usepackage{mathpazo,courier,euler}
       
    30 \usepackage[scaled=.95]{helvet}
       
    31 \usepackage{amsmath}
       
    32 
       
    33 \definecolor{darkgreen}{rgb}{0,0.5,0}
       
    34 
       
    35 \usepackage{listings}
       
    36 \lstset{language=Python,
       
    37     basicstyle=\ttfamily\bfseries,
       
    38     commentstyle=\color{red}\itshape,
       
    39   stringstyle=\color{darkgreen},
       
    40   showstringspaces=false,
       
    41   keywordstyle=\color{blue}\bfseries}
       
    42 
       
    43 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
       
    44 % Macros
       
    45 \setbeamercolor{emphbar}{bg=blue!20, fg=black} % colours of the box drawn by \emphbar
       % \emphbar{<text>}: typesets <text> inside a beamercolorbox with rounded
       % corners, using the `emphbar' beamer colour set above.
       
    46 \newcommand{\emphbar}[1]
       
    47 {\begin{beamercolorbox}[rounded=true]{emphbar} 
       
    48       {#1}
       
    49  \end{beamercolorbox}
       
    50 }
       
    51 \newcounter{time} % running total that \inctime adds to (presumably minutes elapsed -- confirm)
       
    52 \setcounter{time}{0}
       
    53 \newcommand{\inctime}[1]{\addtocounter{time}{#1}{\tiny \thetime\ m}}
       % \inctime{<n>}: adds <n> to the `time' counter, then prints the new total
       % in \tiny type followed by " m". The \tiny switch is confined to the inner group.
       
    54 
       
    55 \newcommand{\typ}[1]{\lstinline{#1}} % \typ{<code>}: inline code via listings' \lstinline
       % \lstinline receives its text as an already-read macro argument here, so
       % special characters must be backslash-escaped at the call site,
       % as in \typ{\%timeit}.
       
    56 
       
    57 \newcommand{\kwrd}[1]{\texttt{\textbf{\textcolor{blue}{#1}}}}
       % \kwrd{<word>}: typesets <word> in bold blue typewriter type.
       % The body has no spaces next to its outer braces, so the macro itself
       % adds no inter-word space before or after the keyword; spacing comes
       % only from the surrounding text. \textcolor limits the colour to <word>.
       
    58 
       
    59 %%% This is from Fernando's setup.
       
    60 % \usepackage{color}
       
    61 % \definecolor{orange}{cmyk}{0,0.4,0.8,0.2}
       
    62 % % Use and configure listings package for nicely formatted code
       
    63 % \usepackage{listings}
       
    64 % \lstset{
       
    65 %    language=Python,
       
    66 %    basicstyle=\small\ttfamily,
       
    67 %    commentstyle=\ttfamily\color{blue},
       
    68 %    stringstyle=\ttfamily\color{orange},
       
    69 %    showstringspaces=false,
       
    70 %    breaklines=true,
       
    71 %    postbreak = \space\dots
       
    72 % }
       
    73 
       
    74 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
       
    75 % Title page
       
    76 \title[Statistics]{Python for Science and Engg:\\ Basic data processing}
       
    77 
       
    78 \author[FOSSEE] {FOSSEE}
       
    79 
       
    80 \institute[IIT Bombay] {Department of Aerospace Engineering\\IIT Bombay}
       
    81 
       
    82 \date[] {SciPy 2010, Introductory tutorials,\\Day 1, Session 3}
       
    83 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
       
    84 
       
    85 %\pgfdeclareimage[height=0.75cm]{iitmlogo}{iitmlogo}
       
    86 %\logo{\pgfuseimage{iitmlogo}}
       
    87 
       
    88 
       
    89 %% Inserts an "Outline" frame at the beginning of each subsection.
       
    90 %% Delete this hook if you do not want the table of contents to pop up there.
       %% The empty [] supplies empty text for starred subsections; <beamer>
       %% restricts the frame to beamer (presentation) mode.
       
    91 \AtBeginSubsection[]
       
    92 {
       
    93   \begin{frame}<beamer>
       
    94     \frametitle{Outline}
       
    95     \tableofcontents[currentsection,currentsubsection]
       
    96   \end{frame}
       
    97 }
       
    98 
       
    99 \AtBeginSection[] % inserts an "Outline" frame at the beginning of each section
       % The empty [] supplies empty text for starred sections; delete this
       % hook if the per-section table of contents is not wanted.
       
   100 {
       
   101   \begin{frame}<beamer>
       
   102     \frametitle{Outline}
       
   103     \tableofcontents[currentsection,currentsubsection]
       
   104   \end{frame}
       
   105 }
       
   106 
       
   107 \newcommand{\num}{\texttt{numpy}} % \num: prints the word "numpy" in typewriter type
       
   108 
       
   109 
       
   110 % If you wish to uncover everything in a step-wise fashion, uncomment
       
   111 % the following command: 
       
   112 %\beamerdefaultoverlayspecification{<+->}
       
   113 
       
   114 %\includeonlyframes{current,current1,current2,current3,current4,current5,current6}
       
   115 
       
   116 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
       
   117 % DOCUMENT STARTS
       
   118 \begin{document}
       
   119 
       
   120 \begin{frame}
       
   121   \maketitle
       
   122 \end{frame}
       
   123 
       
   124 %% \begin{frame}
       
   125 %%   \frametitle{Outline}
       
   126 %%   \tableofcontents
       
   127 %%   % You might wish to add the option [pausesections]
       
   128 %% \end{frame}
       
   129 
       
   130 \section{Computing the mean}
       
   131 \begin{frame}
       
   132   \frametitle{Value of acceleration due to gravity?}
       
   133   \begin{itemize}
       
   134     \item We already have \typ{pendulum.txt}
       
   135     \item We know that $ T = 2\pi \sqrt{\frac{L}{g}} $
       
   136     \item So $ g = \frac{4 \pi^2 L}{T^2}  $
       
   137     \item Calculate $g$ - acceleration due to gravity for each pair of
       
   138         $L$ and $T$
       
   139     \item Hence calculate mean $g$
       
   140   \end{itemize}
       
   141 \end{frame}
       
   142 
       
   143 \begin{frame}[fragile]
       
   144   \frametitle{Acceleration due to gravity - $g$\ldots}
       
   145   \begin{lstlisting}
       
   146 In []: g_list = []
       
   147 In []: for line in open('pendulum.txt'):
       
   148   ....     point = line.split()
       
   149   ....     L = float(point[0])
       
   150   ....     t = float(point[1])
       
   151   ....     g = 4 * pi * pi * L / (t * t)
       
   152   ....     g_list.append(g)
       
   153   \end{lstlisting}
       
   154 \end{frame}
       
   155 
       
   156 \begin{frame}[fragile]
       
   157   \frametitle{Mean $g$ - Classical method}
       
   158   \begin{lstlisting}
       
   159 In []: total = 0
       
   160 In []: for g in g_list:
       
   161  ....:     total += g
       
   162  ....:
       
   163 
       
   164 In []: g_mean = total / len(g_list)
       
   165 In []: print 'Mean: ', g_mean
       
   166   \end{lstlisting}
       
   167 \end{frame}
       
   168 
       
   169 \begin{frame}[fragile]
       
   170   \frametitle{Mean $g$ - Slightly improved method}
       
   171   \begin{lstlisting}
       
   172 In []: g_mean = sum(g_list) / len(g_list)
       
   173 In []: print 'Mean: ', g_mean
       
   174   \end{lstlisting}
       
   175 \end{frame}
       
   176 
       
   177 \begin{frame}[fragile]
       
   178   \frametitle{Mean $g$ - One liner}
       
   179   \begin{lstlisting}
       
   180 In []: g_mean = mean(g_list)
       
   181 In []: print 'Mean: ', g_mean
       
   182   \end{lstlisting}
       
   183   \inctime{10}
       
   184 \end{frame}
       
   185 
       
   186 \section{Processing voluminous data}
       
   187 \begin{frame}
       
   188   \frametitle{More on data processing}
       
   189   \begin{block}{}
       
   190     We have a huge data file--180,000 records.\\How do we do
       
   191     \emph{efficient} statistical computations, i.e. find mean, median,
       
   192     standard deviation, etc.?\\How do we draw pie charts?
       
   193   \end{block}
       
   194 \end{frame}
       
   195 
       
   196 \begin{frame}
       
   197   \frametitle{Structure of the file}
       
   198   Understanding the structure of \typ{sslc1.txt}
       
   199   \begin{itemize}
       
   200     \item Each line in the file has a student's details (record)
       
   201     \item Each record consists of fields separated by ';'
       
   202   \end{itemize}
       
   203 \emphbar{A;015162;JENIL T P;081;060;77;41;74;333;P;;}
       
   204 \end{frame}
       
   205 
       
   206 \begin{frame}
       
   207   \frametitle{Structure of the file \ldots}
       
   208 \emphbar{A;015163;JOSEPH RAJ S;083;042;47;AA;72;244;;;}
       
   209   Each record consists of:
       
   210   \begin{itemize}
       
   211     \item Region Code
       
   212     \item Roll Number
       
   213     \item Name
       
   214     \item Marks of 5 subjects: second lang., first lang., Math, Science,
       
   215         Social Studies
       
   216     \item Total marks
       
   217     \item Pass/Fail (P/F)
       
   218     \item Withheld (W)
       
   219   \end{itemize}
       
   220   \inctime{5}
       
   221 \end{frame}
       
   222 
       
   223 \begin{frame}
       
   224   \frametitle{Statistical Analysis: Problem statement}
       
   225   1. Read the data supplied in the file \typ{sslc1.txt} and carry out the following:
       
   226   \begin{itemize}
       
   227     \item[a] Draw a pie chart representing proportion of students who scored more than 90\% in each region in Science.
       
   228     \item[b] Print mean, median and standard deviation of math scores for all regions combined.
       
   229   \end{itemize}
       
   230 \end{frame}
       
   231 
       
   232 \begin{frame}
       
   233   \frametitle{Problem statement: explanation}
       
   234     \emphbar{a. Draw a pie chart representing proportion of students who scored more than 90\% in each region in Science.}
       
   235 \begin{columns}
       
   236     \column{5.25\textwidth}
       
   237     \hspace*{.5in}
       
   238 \includegraphics[height=2.6in, interpolate=true]{data/science}
       
   239     \column{0.8\textwidth}
       
   240 \end{columns}
       
   241 \end{frame}
       
   242 
       
   243 \begin{frame}
       
   244   \frametitle{Machinery Required}
       
   245   \begin{itemize}
       
   246     \item File reading
       
   247     \item Parsing
       
   248     \item Dictionaries 
       
   249     \item Arrays
       
   250     \item Statistical operations
       
   251   \end{itemize}
       
   252 \end{frame}
       
   253 
       
   254 \subsection{Data processing}
       
   255 \begin{frame}[fragile]
       
   256   \frametitle{File reading and parsing \ldots}
       
   257 \emphbar{Reading files line by line is the same as we had done with the pendulum example.}
       
   258 
       
   259   \begin{lstlisting}
       
   260 for record in open('sslc1.txt'):
       
   261     fields = record.split(';')
       
   262   \end{lstlisting}
       
   263 \end{frame}
       
   264 
       
   265 \subsection{Dictionaries}
       
   266 \begin{frame}[fragile]
       
   267   \frametitle{Dictionaries: Introduction}
       
   268   \begin{itemize}
       
   269     \item Lists index using integers\\
       
   270 Recall \typ{p = [2, 3, 5, 7]} and\\
       
   271 \typ{p[1]} is equal to \typ{3}
       
   272     \item Dictionaries index using strings
       
   273   \end{itemize}
       
   274 \end{frame}
       
   275 
       
   276 \begin{frame}[fragile]
       
   277   \frametitle{Dictionaries \ldots}
       
   278   \begin{lstlisting}
       
   279 In []: d = {'png' : 'image file',
       
   280       'txt' : 'text file', 
       
   281       'py' : 'python code',
       
   282       'java': 'bad code', 
       
   283       'cpp': 'complex code'}
       
   284 
       
   285 In []: d['txt']
       
   286 Out[]: 'text file'
       
   287   \end{lstlisting}
       
   288 \end{frame}
       
   289 
       
   290 \begin{frame}[fragile]
       
   291   \frametitle{Dictionaries \ldots}
       
   292   \begin{lstlisting}
       
   293 In []: 'py' in d
       
   294 Out[]: True
       
   295 
       
   296 In []: 'jpg' in d
       
   297 Out[]: False
       
   298   \end{lstlisting}
       
   299 \end{frame}
       
   300 
       
   301 \begin{frame}[fragile]
       
   302   \frametitle{Dictionaries \ldots}
       
   303   \begin{small}
       
   304     \begin{lstlisting}
       
   305 In []: d.keys()
       
   306 Out[]: ['cpp', 'py', 'txt', 'java', 'png']
       
   307 
       
   308 In []: d.values()
       
   309 Out[]: ['complex code', 'python code',
       
   310         'text file', 'bad code', 
       
   311         'image file']
       
   312     \end{lstlisting}
       
   313   \end{small}
       
   314   \inctime{10}
       
   315 \end{frame}
       
   316 
       
   317 \begin{frame}[fragile]
       
   318   \frametitle{Inserting elements into dictionary}
       
   319   \emphbar{\alert{d[key] = value}}
       
   320   \begin{lstlisting}
       
   321     In []: d['bin'] = 'binary file'
       
   322     In []: d
       
   323     Out[]: 
       
   324     {'bin': 'binary file',
       
   325      'cpp': 'complex code',
       
   326      'java': 'bad code',
       
   327      'png': 'image file',
       
   328      'py': 'python code',
       
   329      'txt': 'text file'}
       
   330   \end{lstlisting}
       
   331 \end{frame}
       
   332 
       
   333 \begin{frame}[fragile]
       
   334   \frametitle{Getting back to the problem}
       
   335   Let our dictionary be:
       
   336   \begin{lstlisting}
       
   337 science = {}
       
   338   \end{lstlisting}
       
   339 \begin{itemize}
       
   340     \item Keys will be region codes
       
   341     \item Values will be the number of students who scored more than 90\% in that region in Science
       
   342   \end{itemize}
       
   343   \begin{block}{Sample \typ{science} dictionary}
       
   344     \{'A': 729, 'C': 764, 'B': 1120,'E': 414, 'D': 603, 'F': 500\}
       
   345   \end{block}
       
   346 
       
   347 \end{frame}
       
   348 
       
   349 \begin{frame}[fragile]
       
   350   \frametitle{Building parsed data \ldots}
       
   351   \begin{lstlisting}
       
   352 science = {}
       
   353 
       
   354 for record in open('sslc1.txt'):
       
   355     fields = record.split(';')
       
   356 
       
   357     region_code = fields[0].strip()
       
   358   \end{lstlisting}
       
   359 \end{frame}
       
   360 
       
   361 \begin{frame}[fragile]
       
   362   \frametitle{Building parsed data \ldots}
       
   363   \begin{lstlisting}
       
   364     if region_code not in science:
       
   365         science[region_code] = 0
       
   366 
       
   367     score_str = fields[6].strip()
       
   368 
       
   369     score = 0
       
   370     if score_str != 'AA':
       
   371         score = int(score_str)
       
   372 
       
   373     if score > 90:
       
   374         science[region_code] += 1
       
   375   \end{lstlisting}
       
   376 \end{frame}
       
   377 
       
   378 \begin{frame}[fragile]
       
   379   \frametitle{Building parsed data \ldots}
       
   380   \begin{lstlisting}
       
   381 print science
       
   382 print science.keys()
       
   383 print science.values()
       
   384   \end{lstlisting}
       
   385 \end{frame}
       
   386 
       
   387 \subsection{Visualizing data}
       
   388 \begin{frame}[fragile]
       
   389   \frametitle{Pie Chart}
       
   390   \begin{lstlisting}
       
   391     pie(science.values())
       
   392   \end{lstlisting}
       
   393 \includegraphics[height=2in, interpolate=true]{data/science_nolabel}
       
   394 \end{frame}
       
   395 
       
   396 \begin{frame}[fragile]
       
   397   \frametitle{Pie chart}
       
   398   \small
       
   399   \begin{lstlisting}
       
   400 pie(science.values(), 
       
   401     labels = science.keys())
       
   402 title('Students scoring 90% and above '
       
   403       'in science by region')
       
   404 savefig('science.png')
       
   405   \end{lstlisting}
       
   406 \begin{columns}
       
   407     \column{5.25\textwidth}
       
   408     \hspace*{1.1in}
       
   409 \includegraphics[height=2in, interpolate=true]{data/science}
       
   410     \column{0.8\textwidth}
       
   411 \end{columns}
       
   412   \inctime{10}
       
   413 \end{frame}
       
   414 
       
   415 \begin{frame}
       
   416   \frametitle{Problem statement}
       
   417     \emphbar{b. Print mean, median and standard deviation of math scores for all regions combined.}
       
   418 \end{frame}
       
   419 
       
   420 \begin{frame}[fragile]
       
   421   \frametitle{Building data for statistics}
       
   422   \begin{lstlisting}
       
   423 math_scores = []
       
   424 
       
   425 for record in open('sslc1.txt'):
       
   426     fields = record.split(';')
       
   427 
       
   428     score_str = fields[5].strip()
       
   429     score = 0
       
   430     if score_str != 'AA':
       
   431         score = int(score_str)
       
   432 
       
   433     math_scores.append(score)
       
   434   \end{lstlisting}
       
   435 \end{frame}
       
   436 
       
   437 \subsection{Obtaining statistics}
       
   438 \begin{frame}[fragile]
       
   439   \frametitle{Obtaining statistics}
       
   440   \begin{lstlisting}
       
   441 print 'Mean: ', mean(math_scores)
       
   442 
       
   443 print 'Median: ', median(math_scores)
       
   444 
       
   445 print 'Standard Deviation: ', \
       
   446               std(math_scores)
       
   447   \end{lstlisting}
       
   448   \inctime{10}
       
   449 \end{frame}
       
   450 
       
   451 \begin{frame}[fragile]
       
   452   \frametitle{Obtaining statistics: efficiently!}
       
   453   \begin{lstlisting}
       
   454 math_array = array(math_scores)
       
   455 
       
   456 print 'Mean: ', mean(math_array)
       
   457 
       
   458 print 'Median: ', median(math_array)
       
   459 
       
   460 print 'Standard Deviation: ', \
       
   461               std(math_array)
       
   462   \end{lstlisting}
       
   463   \inctime{5}
       
   464 \end{frame}
       
   465 
       
   466 \begin{frame}[fragile]
       
   467   \frametitle{IPython tip: Timing}
       
   468 
       
   469 Try the following:
       
   470   \begin{lstlisting}
       
   471 In []: %timeit mean(math_scores)
       
   472 
       
   473 In []: %timeit mean(math_array)
       
   474 
       
   475 In []: %timeit?
       
   476 
       
   477   \end{lstlisting}
       
   478 
       
   479   \begin{itemize}
       
   480       \item \typ{\%timeit}: accurate, many measurements
       
   481       \item Can also use \typ{\%time}
       
   482       \item \typ{\%time}: less accurate, one measurement 
       
   483   \end{itemize}
       
   484 
       
   485   \inctime{5}
       
   486 \end{frame}
       
   487 
       
   488 \begin{frame}[fragile]
       
   489   \frametitle{What tools did we use?}
       
   490   \begin{itemize}
       
   491    \item More data parsing
       
   492    \item Dictionaries for storing data
       
   493    \item Facilities for drawing pie charts
       
   494    \item Functions for statistical computations - mean, median, standard deviation
       
   495    \item Efficient array manipulations
       
   496    \item Timing in IPython
       
   497   \end{itemize}
       
   498 
       
   499 \end{frame}
       
   500 
       
   501 \end{document}
       
   502 
       
   503 %% Questions for Quiz %%
       
   504 %% ------------------ %%
       
   505 
       
   506 \begin{frame}
       
   507 \frametitle{\incqno }
       
   508   A sample line from a Comma Separated Values (CSV) file:\\
       
   509   \vspace*{0.2in}
       
   510   \emph{Rossum, Guido, 42, 56, 34, 54}\\
       
   511   \vspace*{0.2in}
       
   512   What code would you use to separate the line into fields?
       
   513 \end{frame}
       
   514 
       
   515 \begin{frame}[fragile]
       
   516 \frametitle{\incqno }
       
   517   \begin{lstlisting}
       
   518   In []: a = [1, 2, 5, 9]
       
   519   \end{lstlisting}
       
   520   How do you find the length of this list?
       
   521 \end{frame}
       
   522 
       
   523 \begin{frame}[fragile]
       
   524 \frametitle{\incqno }
       
   525   \begin{lstlisting}
       
   526   In [1]: d = {
       
   527           'a': 1,
       
   528           'b': 2
       
   529           }
       
   530   In [2]: print d['c']
       
   531   \end{lstlisting}
       
   532   What is the output?
       
   533 \end{frame}
       
   534 
       
   535 \begin{frame}[fragile]
       
   536 \frametitle{\incqno }
       
   537 \begin{lstlisting}
       
   538 In []: sc = {'A': 10, 'B': 20, 
       
   539              'C': 70}
       
   540 \end{lstlisting}
       
   541 Given the above dictionary, what command will you give to plot a
       
   542 pie-chart?
       
   543 \end{frame}
       
   544 
       
   545 \begin{frame}[fragile]
       
   546 \frametitle{\incqno }
       
   547 \begin{lstlisting}
       
   548 In []: marks = [10, 20, 30, 50, 55, 
       
   549                 75, 83] 
       
   550 \end{lstlisting}
       
   551 Given the above marks, how will you calculate the \alert{mean} and
       
   552 \alert{standard deviation}?
       
   553 \end{frame}
       
   554 
       
   555 \begin{frame}[fragile]
       
   556 \frametitle{\incqno }
       
   557 \begin{lstlisting}
       
   558 In []: marks = [10, 20, 30, 50, 55, 
       
   559                75, 83] 
       
   560 \end{lstlisting}
       
   561 How will you convert the list \texttt{marks} to an \alert{array}?
       
   562 \end{frame}
       
   563 
       
   564 %% \begin{frame}[fragile]
       
   565 %% \frametitle{\incqno }
       
   566 %%   \begin{lstlisting}
       
   567 %%   for x in "abcd":
       
   568 %%       print x
       
   569 
       
   570 %%   a
       
   571 %%   b
       
   572 %%   c
       
   573 %%   d
       
   574 %%   \end{lstlisting}
       
   575 %%   How do you get the following output? 
       
   576 %%   \begin{lstlisting}
       
   577 %%     0 a
       
   578 %%     1 b
       
   579 %%     2 c
       
   580 %%     3 d
       
   581 %%   \end{lstlisting}
       
   582 %% \end{frame}
       
   583