% day1/session3.tex
% author Madhusudan.C.S <madhusudancs@gmail.com>
% Wed, 28 Oct 2009 20:33:17 +0530
% changeset 252 359f77927931
% parent 249 135062d6f91f
% parent 243 33ede079857d
% child 263 8a4a1e5aec85
% permissions -rw-r--r--
% Merged Madhu and Mainline branches.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%Tutorial slides on Python.
%
% Author: FOSSEE
% Copyright (c) 2009, FOSSEE, IIT Bombay
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\documentclass[14pt,compress]{beamer}
%\documentclass[draft]{beamer}
%\documentclass[compress,handout]{beamer}
%\usepackage{pgfpages} 
%\pgfpagesuselayout{2 on 1}[a4paper,border shrink=5mm]

% Modified from: generic-ornate-15min-45min.de.tex
% Presentation-mode appearance: Warsaw theme combined with the infolines
% outer theme; covered (not yet uncovered) items are drawn transparent.
\mode<presentation>
{
  \usetheme{Warsaw}
  \useoutertheme{infolines}
  \setbeamercovered{transparent}
}

\usepackage[english]{babel}
\usepackage[latin1]{inputenc}
%\usepackage{times}
\usepackage[T1]{fontenc}

% Taken from Fernando's slides.
\usepackage{ae,aecompl}
\usepackage{mathpazo,courier,euler}
\usepackage[scaled=.95]{helvet}
\usepackage{amsmath}

\definecolor{darkgreen}{rgb}{0,0.5,0}

\usepackage{listings}
% Defaults for every listing in the deck: Python highlighting set in bold
% typewriter type, with coloured keywords, comments and strings.
\lstset{
  language=Python,
  basicstyle=\ttfamily\bfseries,
  keywordstyle=\color{blue}\bfseries,
  commentstyle=\color{red}\itshape,
  stringstyle=\color{darkgreen},
  showstringspaces=false,
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Macros
% \emphbar{<text>}: set <text> inside a rounded beamercolorbox that uses the
% `emphbar' beamer colour defined here (black text on a blue!20 background).
\setbeamercolor{emphbar}{bg=blue!20, fg=black}
\newcommand{\emphbar}[1]
{\begin{beamercolorbox}[rounded=true]{emphbar} 
      {#1}
 \end{beamercolorbox}
}
% Running clock for the presenter.  \inctime{<minutes>} adds <minutes> to the
% `time' counter and prints the new total in tiny type, followed by "m".
\newcounter{time}
\setcounter{time}{0}
\newcommand{\inctime}[1]{%
  \addtocounter{time}{#1}%
  {\tiny\thetime\ m}%
}

% \typ{<code>}: inline code fragment, typeset through listings' \lstinline.
\newcommand{\typ}[1]{\lstinline{#1}}

% \kwrd{<word>}: typeset a language keyword in bold, blue typewriter type.
% The definition contains no spaces of its own, so the keyword only gets the
% spacing written at the call site (the earlier definition began and ended
% with a space, doubling the gap on both sides of the keyword).
\newcommand{\kwrd}[1]{\texttt{\textbf{\textcolor{blue}{#1}}}}

%%% This is from Fernando's setup.
% \usepackage{color}
% \definecolor{orange}{cmyk}{0,0.4,0.8,0.2}
% % Use and configure listings package for nicely formatted code
% \usepackage{listings}
% \lstset{
%    language=Python,
%    basicstyle=\small\ttfamily,
%    commentstyle=\ttfamily\color{blue},
%    stringstyle=\ttfamily\color{orange},
%    showstringspaces=false,
%    breaklines=true,
%    postbreak = \space\dots
% }

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Title page
\title[Statistics]{Python for Science and Engg: Statistics}

\author[FOSSEE] {FOSSEE}

\institute[IIT Bombay] {Department of Aerospace Engineering\\IIT Bombay}
\date[] {31 October 2009\\Day 1, Session 3}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%\pgfdeclareimage[height=0.75cm]{iitmlogo}{iitmlogo}
%\logo{\pgfuseimage{iitmlogo}}


%% Delete this, if you do not want the table of contents to pop up at
%% the beginning of each subsection:
% Outline frame inserted at the start of every subsection.  The <beamer>
% mode specification limits the frame to beamer mode.
\AtBeginSubsection[]
{
  \begin{frame}<beamer>
    \frametitle{Outline}
    \tableofcontents[currentsection,currentsubsection]
  \end{frame}
}

% Outline frame inserted at the start of every section, using the same
% table-of-contents options as the subsection hook.  The <beamer> mode
% specification limits the frame to beamer mode.
\AtBeginSection[]
{
  \begin{frame}<beamer>
    \frametitle{Outline}
    \tableofcontents[currentsection,currentsubsection]
  \end{frame}
}

% \num: the word "numpy" set in typewriter type.
\newcommand{\num}{\texttt{numpy}}


% If you wish to uncover everything in a step-wise fashion, uncomment
% the following command: 
%\beamerdefaultoverlayspecification{<+->}

%\includeonlyframes{current,current1,current2,current3,current4,current5,current6}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% DOCUMENT STARTS
\begin{document}

\begin{frame}
  \maketitle
\end{frame}

%% \begin{frame}
%%   \frametitle{Outline}
%%   \tableofcontents
%%   % You might wish to add the option [pausesections]
%% \end{frame}

\section{Processing voluminous data}
\begin{frame}
  \frametitle{More on data processing}
  \begin{block}{}
    We have a huge---1m records---data file.\\How do we do \emph{efficient} statistical computations, that is, find mean, median, mode, standard deviation, etc., and draw pie charts?
  \end{block}
\end{frame}


\begin{frame}
  \frametitle{Statistical Analysis: Problem statement}
  Read the data supplied in \emph{sslc1.txt} and carry out the following:
  \begin{enumerate}
    \item Draw a pie chart representing the proportion of students who scored more than 90\% in each region in Science.
    \item Draw a pie chart representing the proportion of students who scored more than 90\% in each subject across regions.
    \item Print mean, median, mode and standard deviation of math scores for all regions combined.
  \end{enumerate}
\end{frame}

\begin{frame}
  \frametitle{Problem statement: explanation}
    \emphbar{Draw a pie chart representing the proportion of students who scored more than 90\% in each region in Science.}
    \begin{enumerate}
      \item Complete(100\%) data - Number of students who scored more than 90\% in Science
      \item Each slice - Number of students who scored more than 90\% in Science in one region
    \end{enumerate}
\end{frame}

\begin{frame}
  \frametitle{Problem statement: explanation}
    \emphbar{Draw a pie chart representing the proportion of students who scored more than 90\% in each subject across regions.}
    \begin{enumerate}
      \item Complete(100\%) data - Number of students who scored more than 90\% across all regions
      \item Each slice - Number of students who scored more than 90\% in each subject across all regions
    \end{enumerate}
\end{frame}

\begin{frame}
  \frametitle{Statistical Analysis and Parsing \ldots}
  Machinery Required -
  \begin{itemize}
    \item File reading
    \item Parsing
    \item Dictionaries
    \item NumPy arrays
    \item Statistical operations
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{File reading and parsing}
  Understanding the structure of sslc1.txt
  \begin{itemize}
    \item One line in file corresponds to a student's details
    \item aka record
    \item Each record consists of fields separated by ';'
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{File reading and parsing \ldots}
  Each record consists of:
  \begin{itemize}
    \item Region Code
    \item Roll Number
    \item Name
    \item Marks of 5 subjects: English, Hindi, Maths, Science, Social
    \item Total marks
    \item Pass/Fail (P/F)
    \item Withdrawn (W)
  \end{itemize}
  \inctime{5}
\end{frame}

\subsection{Data processing}
\begin{frame}[fragile]
  \frametitle{File reading and parsing \ldots}
  \begin{lstlisting}
for record in open('sslc1.txt'):
    fields = record.split(';')
  \end{lstlisting}
\end{frame}

\subsection{Dictionary}
\begin{frame}[fragile]
  \frametitle{Dictionary: Introduction}
  \begin{itemize}
    \item lists index: 0 \ldots n
    \item dictionaries index using strings
  \end{itemize}
  \begin{block}{Example}
d = \{ ``Hitchhiker's guide'' : 42,
     ``Terminator'' : ``I'll be back''\}\\
d[``Terminator''] => ``I'll be back''
  \end{block}
\end{frame}

\begin{frame}[fragile]
  \frametitle{Dictionary: Introduction}
  \begin{lstlisting}
In [1]: d = {"Hitchhiker's guide" : 42,
      "Terminator" : "I'll be back"}

In [2]: d["Hitchhiker's guide"]
Out[2]: 42

In [3]: "Hitchhiker's guide" in d
Out[3]: True

In [4]: "Guido" in d
Out[4]: False
  \end{lstlisting}
\end{frame}

\begin{frame}[fragile]
  \frametitle{Dictionary: Introduction}
  \begin{lstlisting}
In [5]: d.keys()
Out[5]: ['Terminator', "Hitchhiker's 
                              guide"]

In [6]: d.values()
Out[6]: ["I'll be back", 42]
  \end{lstlisting}
\end{frame}

\begin{frame}[fragile]
  \frametitle{Back to lists: Iterating}
  \begin{itemize}
    \item Python's \kwrd{for} loop iterates through list items
    \item In other languages (C/C++) we run through indices and pick items from the array using these indices
    \item In Python, while iterating through list items, the current position is not available
  \end{itemize}
  \begin{block}{Iterating through indices}
    What if we want the index of an item of a list?
  \end{block}

\end{frame}

\begin{frame}[fragile]
  \frametitle{enumerate: Iterating through list indices}
  \begin{lstlisting}
In [1]: names = ["Guido","Alex", "Tim"]

In [2]: for i, name in enumerate(names):
   ...:     print i, name
   ...: 
0 Guido
1 Alex
2 Tim
  \end{lstlisting}
  \inctime{5}
\end{frame}

\begin{frame}[fragile]
  \frametitle{Continuing with our Dictionary}
  Let our dictionary be:
  \begin{lstlisting}
science = {} # is an empty dictionary
  \end{lstlisting}
\end{frame}

\begin{frame}[fragile]
  \frametitle{Dictionary - Building parsed data}
  \begin{itemize}
    \item \emph{Keys} of \emph{science} will be region codes
    \item Each \emph{value} of \emph{science} will be the number of students who scored more than 90\% in that region
  \end{itemize}
\end{frame}

\begin{frame}[fragile]
  \frametitle{Building parsed data \ldots}
  \begin{lstlisting}
from pylab import pie

science = {}

for record in open('sslc1.txt'):
    record = record.strip()
    fields = record.split(';')

    region_code = fields[0].strip()
  \end{lstlisting}
\end{frame}

% Body of the per-record loop started on the previous frame.
% The five subject marks start at fields[3] in the order English, Hindi,
% Maths, Science, Social, so the Science mark is fields[6] (fields[4] is
% Hindi).  The conditional expression is parenthesised so that it may
% legally continue on a second line.
\begin{frame}[fragile]
  \frametitle{Building parsed data \ldots}
  \begin{lstlisting}
if region_code not in science:
    science[region_code] = 0

score_str = fields[6].strip()

score = (int(score_str) if
    score_str != 'AA' else 0)

if score > 90:
    science[region_code] += 1
  \end{lstlisting}
\end{frame}

\subsection{Visualizing data}
% Pie chart of the per-region Science counts, with the resulting figure.
% The title is written as two adjacent string literals: Python joins them
% into one string, whereas a single quoted string may not span two lines.
\begin{frame}[fragile]
  \frametitle{Pie charts}
  \small
  \begin{lstlisting}
figure(1)
pie(science.values(),
    labels=science.keys())
title('Students scoring 90% and above '
      'in science by region')
savefig('/tmp/science.png')
  \end{lstlisting}
\begin{columns}
    \column{5.25\textwidth}
    \hspace*{1.1in}
\includegraphics[height=2in, interpolate=true]{data/science}
    \column{0.8\textwidth}
\end{columns}
  \inctime{5}
\end{frame}

\begin{frame}[fragile]
  \frametitle{Building data for all subjects \ldots}
  \begin{lstlisting}
from pylab import pie
from scipy import mean, median, std
from scipy import stats

scores = [[], [], [], [], []]
ninety_percents = [{}, {}, {}, {}, {}]
  \end{lstlisting}
\end{frame}

\begin{frame}[fragile]
  \frametitle{Building data for all subjects \ldots}
  \begin{lstlisting}
for record in open('sslc1.txt'):
    record = record.strip()
    fields = record.split(';')

    region_code = fields[0].strip()
  \end{lstlisting}
\end{frame}

% Body of the per-record loop started on the previous frame: walks the five
% subject marks (fields[3] to fields[7]), records every score and counts the
% scores above 90 per region.  The conditional expression is parenthesised
% so that it may legally continue on a second line.
\begin{frame}[fragile]
  \frametitle{Building data for all subjects \ldots}
  \small
  \begin{lstlisting}
for i, field in enumerate(fields[3:8]):
    if region_code not in ninety_percents[i]:
        ninety_percents[i][region_code] = 0

    score_str = field.strip()
    score = (int(score_str) if
      score_str != 'AA' else 0)

    scores[i].append(score)

    if score > 90:
        ninety_percents[i][region_code] += 1
  \end{lstlisting}
\end{frame}

\begin{frame}[fragile]
  \frametitle{Consolidating data}
  \begin{lstlisting}
subj_total = []
for subject in ninety_percents:
    subj_total.append(sum(
         subject.values()))
  \end{lstlisting}
\end{frame}

% Pie chart of the per-subject totals.  The title is written as adjacent
% string literals: Python joins them into one string, whereas a single
% quoted string may not span several lines.
\begin{frame}[fragile]
  \frametitle{Pie charts}
  \begin{lstlisting}
figure(2)
pie(subj_total, labels=['English',
    'Hindi', 'Maths', 'Science',
    'Social'])
title('Students scoring more than '
      '90% by subject(All regions '
      'combined).')
savefig('/tmp/all_regions.png')
  \end{lstlisting}
\end{frame}

\begin{frame}[fragile]
  \frametitle{Pie charts}
  \includegraphics[height=3in, interpolate=true]{data/all_regions}
\end{frame}

\subsection{Obtaining statistics}
\begin{frame}[fragile]
  \frametitle{Obtaining statistics}
  \begin{block}{Statistics: Mean}
    Obtain the mean of Math scores
  \end{block}
\end{frame}

% Mean of the Maths scores with an explicit loop.  The listings use Python 2
% print statements, and in Python 2 `/' between two integers discards the
% fractional part; starting the accumulator at 0.0 keeps the division (and
% therefore the mean) in floating point.
\begin{frame}[fragile]
  \frametitle{Obtaining statistics: Solution}
  \begin{block}{Statistics: Mean}
    Obtain the mean of Math scores
  \end{block}
  \begin{lstlisting}
math_scores = scores[2]
total = 0.0
for i, score in enumerate(math_scores):
    total += score

mean = total / (i + 1)
print "Mean: ", mean
  \end{lstlisting}
\end{frame}

% Mean of the Maths scores using the built-ins sum() and len().  The
% expression is parenthesised so that it may legally span two lines, and the
% sum is converted to float so that Python 2 does not truncate the quotient.
\begin{frame}[fragile]
  \frametitle{Obtaining statistics: Another solution}
  \begin{block}{Statistics: Mean}
    Obtain the mean of Math scores
  \end{block}
  \begin{lstlisting}
math_scores = scores[2]
mean = (float(sum(math_scores)) /
          len(math_scores))
  \end{lstlisting}
\end{frame}

\begin{frame}[fragile]
\frametitle{NumPy arrays}
  \begin{itemize}
    \item NumPy provides arrays
    \item arrays are very efficient and powerful 
    \item Very easy to perform element-wise operations - \typ{+, -, *, /, \%}
    \begin{lstlisting}
In [1]: a = array([1, 2, 3])
In [2]: b = array([4, 5, 6])

In [3]: a + b
Out[3]: array([5, 7, 9])
    \end{lstlisting}
    \item Very easy to compute statistics
  \end{itemize}
\end{frame}

% Mean, median, mode and standard deviation of the Maths scores computed on
% an array.  The last print statement is continued with a backslash: without
% it the statement ends at the comma and the indented second line is an
% error.
\begin{frame}[fragile]
  \frametitle{Obtaining statistics}
  \begin{lstlisting}
math_scores = array(scores[2])

print "Mean: ", mean(math_scores)

print "Median: ", median(math_scores)

print "Mode: ", stats.mode(math_scores)

print "Standard Deviation: ", \
              std(math_scores)
  \end{lstlisting}
  \inctime{15}
\end{frame}

\begin{frame}[fragile]
  \frametitle{What tools did we use?}
  \begin{itemize}
   \item Dictionaries for storing data
   \item Facilities for drawing pie charts
   \item NumPy arrays for efficient array manipulations
   \item Functions for statistical computations - mean, median, mode, standard deviation
  \end{itemize}
\end{frame}

\section{Least square fit}
\begin{frame}
\frametitle{L vs $T^2$ \ldots}
Let's go back to the L vs $T^2$ plot
\begin{itemize}
\item We first look at obtaining $T^2$ from T
\item Then, we look at plotting a Least Squares fit
\end{itemize}
\end{frame}

\begin{frame}[fragile]
\frametitle{Dealing with data whole-sale}
\begin{lstlisting}
In []: for t in T:
 ....:     TSq.append(t*t)
\end{lstlisting}
\begin{itemize}
\item This is not very efficient
\item We are squaring element after element
\item We use arrays to make this efficient
\end{itemize}
\begin{lstlisting}
In []: L = array(L)
In []: T = array(T)
In []: TSq = T*T
\end{lstlisting}
\end{frame}

\begin{frame}[fragile]
\frametitle{Least Squares Fit}
\vspace{-0.15in}
\begin{figure}
\includegraphics[width=4in]{data/L-Tsq-Line.png}
\end{figure}
\end{frame}

\begin{frame}[fragile]
\frametitle{Least Squares Fit}
\vspace{-0.15in}
\begin{figure}
\includegraphics[width=4in]{data/L-Tsq-points.png}
\end{figure}
\end{frame}

\begin{frame}[fragile]
\frametitle{Least Squares Fit}
\vspace{-0.15in}
\begin{figure}
\includegraphics[width=4in]{data/least-sq-fit.png}
\end{figure}
\end{frame}

\begin{frame}
\frametitle{Least Square Fit Curve}
\begin{itemize}
\item $T^2$ and $L$ have a linear relationship
\item Hence, Least Square Fit Curve is a line
\item We shall use the \typ{lstsq} function
\end{itemize}
\end{frame}

\begin{frame}[fragile]
\frametitle{\typ{lstsq}}
\begin{itemize}
\item We need to fit a line through points for the equation $T^2 = m \cdot L+c$
\item The equation can be re-written as $T^2 = A \cdot p$
\item where A is   
  $\begin{bmatrix}
  L_1 & 1 \\
  L_2 & 1 \\
  \vdots & \vdots\\
  L_N & 1 \\
  \end{bmatrix}$
  and p is 
  $\begin{bmatrix}
  m\\
  c\\
  \end{bmatrix}$
\item We need to find $p$ to plot the line
\end{itemize}
\end{frame}

\subsection{Vandermonde matrix generation}
\begin{frame}[fragile]
\frametitle{Vandermonde Matrix}
\begin{itemize}
\item A is also called a Vandermonde matrix
\item It can be generated using \typ{vander}
\end{itemize}
\begin{lstlisting}
In []: A = vander(L, 2)
\end{lstlisting}
Gives the required Vandermonde matrix
\begin{equation*}
  \begin{bmatrix}
    L_1 & 1 \\
    L_2 & 1 \\
    \vdots & \vdots\\
    L_N & 1 \\
  \end{bmatrix}
\end{equation*}

\end{frame}

\begin{frame}[fragile]
\frametitle{\typ{lstsq} \ldots}
\begin{itemize}
\item Now use the \typ{lstsq} function
\item Along with a lot of things, it returns the least squares solution
\end{itemize}
\begin{lstlisting}
In []: coef, res, r, s = lstsq(A,TSq)
\end{lstlisting}
\end{frame}

\subsection{Plotting}
\begin{frame}[fragile]
\frametitle{Least Square Fit Line \ldots}
We get the points of the line from \typ{coef}
\begin{lstlisting}
In []: Tline = coef[0]*L + coef[1]
\end{lstlisting}
\begin{itemize}
\item Now plot Tline vs. L, to get the Least squares fit line. 
\end{itemize}
\begin{lstlisting}
In []: plot(L, Tline)
\end{lstlisting}
\end{frame}

\begin{frame}[fragile]
  \frametitle{What did we learn?}
  \begin{itemize}
   \item Least square fit
   \item Vandermonde matrix generation
   \item Plotting the least square fit curve
  \end{itemize}
\end{frame}

\end{document}