%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Tutorial slides on Python.
%
% Author: FOSSEE
% Copyright (c) 2009, FOSSEE, IIT Bombay
%
% Beamer deck: processing a large semicolon-separated marks file with
% dictionaries and NumPy arrays, drawing pie charts, and computing
% basic statistics.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\documentclass[14pt,compress]{beamer}
%\documentclass[draft]{beamer}
%\documentclass[compress,handout]{beamer}
%\usepackage{pgfpages}
%\pgfpagesuselayout{2 on 1}[a4paper,border shrink=5mm]

% Modified from: generic-ornate-15min-45min.de.tex
\mode<presentation>
{
  \usetheme{Warsaw}
  \useoutertheme{infolines}
  \setbeamercovered{transparent}
}

\usepackage[english]{babel}
\usepackage[latin1]{inputenc}
%\usepackage{times}
\usepackage[T1]{fontenc}

% Taken from Fernando's slides.
\usepackage{ae,aecompl}
\usepackage{mathpazo,courier,euler}
\usepackage[scaled=.95]{helvet}
\usepackage{amsmath}

\definecolor{darkgreen}{rgb}{0,0.5,0}

\usepackage{listings}
\lstset{language=Python,
  basicstyle=\ttfamily\bfseries,
  commentstyle=\color{red}\itshape,
  stringstyle=\color{darkgreen},
  showstringspaces=false,
  keywordstyle=\color{blue}\bfseries}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Macros

% \emphbar{text}: typesets text inside a rounded beamercolorbox that uses
% the "emphbar" beamer colour (light blue background, black text).
\setbeamercolor{emphbar}{bg=blue!20, fg=black}
\newcommand{\emphbar}[1]
{\begin{beamercolorbox}[rounded=true]{emphbar}
      {#1}
 \end{beamercolorbox}
}

% \inctime{n}: adds n to the running "time" counter and prints the new
% total in tiny type followed by "m".
\newcounter{time}
\setcounter{time}{0}
\newcommand{\inctime}[1]{\addtocounter{time}{#1}{\tiny \thetime\ m}}

% \typ{code}: inline code, typeset through \lstinline.
\newcommand{\typ}[1]{\lstinline{#1}}

% \kwrd{word}: bold blue typewriter text.  The body deliberately has no
% leading or trailing space, so the macro adds no extra inter-word glue.
\newcommand{\kwrd}[1]{\texttt{\textbf{\color{blue}{#1}}}}

%%% This is from Fernando's setup.
% \usepackage{color}
% \definecolor{orange}{cmyk}{0,0.4,0.8,0.2}
% % Use and configure listings package for nicely formatted code
% \usepackage{listings}
% \lstset{
%    language=Python,
%    basicstyle=\small\ttfamily,
%    commentstyle=\ttfamily\color{blue},
%    stringstyle=\ttfamily\color{orange},
%    showstringspaces=false,
%    breaklines=true,
%    postbreak = \space\dots
% }

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Title page
\title[Statistics]{Python for Science and Engg: Statistics}

\author[FOSSEE] {FOSSEE}

\institute[IIT Bombay] {Department of Aerospace Engineering\\IIT Bombay}
\date[] {31 October, 2009\\Day 1, Session 3}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%\pgfdeclareimage[height=0.75cm]{iitmlogo}{iitmlogo}
%\logo{\pgfuseimage{iitmlogo}}

%% Delete this, if you do not want the table of contents to pop up at
%% the beginning of each subsection:
\AtBeginSubsection[]
{
  \begin{frame}<beamer>
    \frametitle{Outline}
    \tableofcontents[currentsection,currentsubsection]
  \end{frame}
}

\AtBeginSection[]
{
  \begin{frame}<beamer>
    \frametitle{Outline}
    \tableofcontents[currentsection,currentsubsection]
  \end{frame}
}

% \num: the word "numpy" in typewriter type.
\newcommand{\num}{\texttt{numpy}}

% If you wish to uncover everything in a step-wise fashion, uncomment
% the following command:
%\beamerdefaultoverlayspecification{<+->}

%\includeonlyframes{current,current1,current2,current3,current4,current5,current6}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% DOCUMENT STARTS
\begin{document}

\begin{frame}
  \maketitle
\end{frame}

%% \begin{frame}
%%   \frametitle{Outline}
%%   \tableofcontents
%%   % You might wish to add the option [pausesections]
%% \end{frame}

\section{Processing voluminous data}

\begin{frame}
  \frametitle{More on data processing}
  \begin{block}{}
    We have a huge---1m records---data file.\\How do we do \emph{efficient} statistical computations, that is find mean, median, mode, standard deviation etc.; draw pie charts?
  \end{block}
\end{frame}

\begin{frame}
  \frametitle{Statistical Analysis: Problem statement}
  Read the data supplied in \emph{sslc1.txt} and carry out the following:
  \begin{enumerate}
    \item Draw a pie chart representing the proportion of students who scored more than 90\% in each region in Science.
    \item Draw a pie chart representing the proportion of students who scored more than 90\% in each subject across regions.
    \item Print mean, median, mode and standard deviation of math scores for all regions combined.
  \end{enumerate}
\end{frame}

\begin{frame}
  \frametitle{Problem statement: explanation}
  \emphbar{Draw a pie chart representing the proportion of students who scored more than 90\% in each region in Science.}
  \begin{enumerate}
    \item Complete(100\%) data - Number of students who scored more than 90\% in Science
    \item Each slice - Number of students who scored more than 90\% in Science in one region
  \end{enumerate}
\end{frame}

\begin{frame}
  \frametitle{Problem statement: explanation}
  \emphbar{Draw a pie chart representing the proportion of students who scored more than 90\% in each subject across regions.}
  \begin{enumerate}
    \item Complete(100\%) data - Number of students who scored more than 90\% across all regions
    \item Each slice - Number of students who scored more than 90\% in each subject across all regions
  \end{enumerate}
\end{frame}

\begin{frame}
  \frametitle{Statistical Analysis and Parsing \ldots}
  Machinery Required -
  \begin{itemize}
    \item File reading
    \item Parsing
    \item Dictionaries
    \item NumPy arrays
    \item Statistical operations
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{File reading and parsing}
  Understanding the structure of sslc1.txt
  \begin{itemize}
    \item One line in file corresponds to a student's details
    \item aka record
    \item Each record consists of fields separated by ';'
  \end{itemize}
\end{frame}

% The field order listed here fixes the indices used in the listings below:
% 0 region code, 1 roll number, 2 name, 3 English, 4 Hindi, 5 Maths,
% 6 Science, 7 Social.
\begin{frame}
  \frametitle{File reading and parsing \ldots}
  Each record consists of:
  \begin{itemize}
    \item Region Code
    \item Roll Number
    \item Name
    \item Marks of 5 subjects: English, Hindi, Maths, Science, Social
    \item Total marks
    \item Pass/Fail (P/F)
    \item Withdrawn (W)
  \end{itemize}
  \inctime{5}
\end{frame}

\subsection{Data processing}

\begin{frame}[fragile]
  \frametitle{File reading and parsing \ldots}
  \begin{lstlisting}
for record in open('sslc1.txt'):
    fields = record.split(';')
  \end{lstlisting}
\end{frame}

\subsection{Dictionary}

\begin{frame}[fragile]
  \frametitle{Dictionary: Introduction}
  \begin{itemize}
    \item lists index: 0 \ldots n
    \item dictionaries index using strings
  \end{itemize}
  \begin{block}{Example}
d = \{ ``Hitchhiker's guide'' : 42, ``Terminator'' : ``I'll be back''\}\\
d[``Terminator''] => ``I'll be back''
  \end{block}
\end{frame}

\begin{frame}[fragile]
  \frametitle{Dictionary: Introduction}
  \begin{lstlisting}
In [1]: d = {"Hitchhiker's guide" : 42,
             "Terminator" : "I'll be back"}

In [2]: d["Hitchhiker's guide"]
Out[2]: 42

In [3]: "Hitchhiker's guide" in d
Out[3]: True

In [4]: "Guido" in d
Out[4]: False
  \end{lstlisting}
\end{frame}

\begin{frame}[fragile]
  \frametitle{Dictionary: Introduction}
  \begin{lstlisting}
In [5]: d.keys()
Out[5]: ['Terminator', "Hitchhiker's guide"]

In [6]: d.values()
Out[6]: ["I'll be back", 42]
  \end{lstlisting}
\end{frame}

\begin{frame}[fragile]
  \frametitle{Back to lists: Iterating}
  \begin{itemize}
    \item Python's \kwrd{for} loop iterates through list items
    \item In other languages (C/C++) we run through indices and pick items from the array using these indices
    \item In Python, while iterating through list items current position is not available
  \end{itemize}
  \begin{block}{Iterating through indices}
    What if we want the index of an item of a list?
  \end{block}
\end{frame}

\begin{frame}[fragile]
  \frametitle{enumerate: Iterating through list indices}
  \begin{lstlisting}
In [1]: names = ["Guido","Alex", "Tim"]

In [2]: for i, name in enumerate(names):
   ...:     print i, name
   ...:
0 Guido
1 Alex
2 Tim
  \end{lstlisting}
  \inctime{5}
\end{frame}

\begin{frame}[fragile]
  \frametitle{Continuing with our Dictionary}
  Let our dictionary be:
  \begin{lstlisting}
science = {} # is an empty dictionary
  \end{lstlisting}
\end{frame}

\begin{frame}[fragile]
  \frametitle{Dictionary - Building parsed data}
  \begin{itemize}
    \item \emph{Keys} of \emph{science} will be region codes
    \item \emph{Values} of \emph{science} will be the number of students who scored more than 90\% in that region
  \end{itemize}
\end{frame}

\begin{frame}[fragile]
  \frametitle{Building parsed data \ldots}
  \begin{lstlisting}
from pylab import pie

science = {}

for record in open('sslc1.txt'):
    record = record.strip()
    fields = record.split(';')

    region_code = fields[0].strip()
  \end{lstlisting}
\end{frame}

% Science is field 6 of a record (0 region, 1 roll number, 2 name,
% 3 English, 4 Hindi, 5 Maths, 6 Science, 7 Social); this matches the
% fields[3:8] slice used later, where index 2 of the slice is Maths.
\begin{frame}[fragile]
  \frametitle{Building parsed data \ldots}
  \begin{lstlisting}
if region_code not in science:
    science[region_code] = 0

score_str = fields[6].strip()

score = int(score_str) if score_str != 'AA' else 0

if score > 90:
    science[region_code] += 1
  \end{lstlisting}
\end{frame}

\subsection{Visualizing data}

\begin{frame}[fragile]
  \frametitle{Pie charts}
  \small
  \begin{lstlisting}
figure(1)
pie(science.values(), labels=science.keys())
title('Students scoring 90% and above in science by region')
savefig('/tmp/science.png')
  \end{lstlisting}
% NOTE(review): a column width of 5.25\textwidth looks unintended (perhaps
% 0.25\textwidth was meant); kept unchanged because the image placement
% below may rely on it -- confirm against the rendered slide.
\begin{columns}
    \column{5.25\textwidth}
    \hspace*{1.1in}
\includegraphics[height=2in, interpolate=true]{data/science}
    \column{0.8\textwidth}
\end{columns}
  \inctime{5}
\end{frame}

\begin{frame}[fragile]
  \frametitle{Building data for all subjects \ldots}
  \begin{lstlisting}
from pylab import pie
from scipy import mean, median, std
from scipy import stats

scores = [[], [], [], [], []]
ninety_percents = [{}, {}, {}, {}, {}]
  \end{lstlisting}
\end{frame}

\begin{frame}[fragile]
  \frametitle{Building data for all subjects \ldots}
  \begin{lstlisting}
for record in open('sslc1.txt'):
    record = record.strip()
    fields = record.split(';')

    region_code = fields[0].strip()
  \end{lstlisting}
\end{frame}

\begin{frame}[fragile]
  \frametitle{Building data for all subjects \ldots}
  \small
  \begin{lstlisting}
for i, field in enumerate(fields[3:8]):
    if region_code not in ninety_percents[i]:
        ninety_percents[i][region_code] = 0

    score_str = field.strip()
    score = int(score_str) if score_str != 'AA' else 0

    scores[i].append(score)

    if score > 90:
        ninety_percents[i][region_code] += 1
  \end{lstlisting}
\end{frame}

\begin{frame}[fragile]
  \frametitle{Consolidating data}
  \begin{lstlisting}
subj_total = []
for subject in ninety_percents:
    subj_total.append(sum(
         subject.values()))
  \end{lstlisting}
\end{frame}

\begin{frame}[fragile]
  \frametitle{Pie charts}
  \begin{lstlisting}
figure(2)
pie(subj_total, labels=['English',
    'Hindi', 'Maths', 'Science',
    'Social'])
title('Students scoring more than 90% by subject(All regions combined).')
savefig('/tmp/all_regions.png')
  \end{lstlisting}
\end{frame}

\begin{frame}[fragile]
  \frametitle{Pie charts}
  \includegraphics[height=3in, interpolate=true]{data/all_regions}
\end{frame}

\subsection{Obtaining statistics}

\begin{frame}[fragile]
  \frametitle{Obtaining statistics}
  \begin{block}{Statistics: Mean}
    Obtain the mean of Math scores
  \end{block}
\end{frame}

% The hand-computed mean is stored in math_mean, not mean, so that it does
% not shadow the mean() imported from scipy and called on a later slide.
% float() avoids Python 2 integer division truncating the result.
\begin{frame}[fragile]
  \frametitle{Obtaining statistics: Solution}
  \begin{block}{Statistics: Mean}
    Obtain the mean of Math scores
  \end{block}
  \begin{lstlisting}
math_scores = scores[2]
total = 0
for i, score in enumerate(math_scores):
    total += score

math_mean = float(total) / (i + 1)
print "Mean: ", math_mean
  \end{lstlisting}
\end{frame}

% Same float-division and naming considerations as the previous listing.
\begin{frame}[fragile]
  \frametitle{Obtaining statistics: Another solution}
  \begin{block}{Statistics: Mean}
    Obtain the mean of Math scores
  \end{block}
  \begin{lstlisting}
math_scores = scores[2]
math_mean = float(sum(math_scores)) / len(math_scores)
  \end{lstlisting}
\end{frame}

\begin{frame}[fragile]
  \frametitle{NumPy arrays}
  \begin{itemize}
    \item NumPy provides arrays
    \item arrays are very efficient and powerful
    \item Very easy to perform element-wise operations - \typ{+, -, *, /, \%}
    \begin{lstlisting}
In [1]: a = array([1, 2, 3])
In [2]: b = array([4, 5, 6])

In [3]: a + b
Out[3]: array([5, 7, 9])
    \end{lstlisting}
    \item Very easy to compute statistics
  \end{itemize}
\end{frame}

\begin{frame}[fragile]
  \frametitle{Obtaining statistics}
  \begin{lstlisting}
math_scores = array(scores[2])

print "Mean: ", mean(math_scores)

print "Median: ", median(math_scores)

print "Mode: ", stats.mode(math_scores)

print "Standard Deviation: ", std(math_scores)
  \end{lstlisting}
  \inctime{15}
\end{frame}

\begin{frame}[fragile]
  \frametitle{What tools did we use?}
  \begin{itemize}
    \item Dictionaries for storing data
    \item Facilities for drawing pie charts
    \item NumPy arrays for efficient array manipulations
    \item Functions for statistical computations - mean, median, mode, standard deviation
  \end{itemize}
\end{frame}

\section{Least square fit}

\begin{frame}
  \frametitle{L vs $T^2$ \ldots}
  Let's go back to the L vs $T^2$ plot
  \begin{itemize}
    \item We first look at obtaining $T^2$ from T
    \item Then, we look at plotting a Least Squares fit
  \end{itemize}
\end{frame}

\begin{frame}[fragile]
  \frametitle{Dealing with data whole-sale}
  \begin{lstlisting}
In []: for t in T:
 ....:     TSq.append(t*t)
  \end{lstlisting}
  \begin{itemize}
    \item This is not very efficient
    \item We are squaring element after element
    \item We use arrays to make this efficient
  \end{itemize}
  \begin{lstlisting}
In []: L = array(L)
In []: T = array(T)
In []: TSq = T*T
  \end{lstlisting}
\end{frame}

\end{document}