# HG changeset patch # User Puneeth Chaganti # Date 1291900353 -19800 # Node ID 7c5431fa2d46b171e853bc39d452009f2152af57 # Parent 9d9e4026238ff87fba675c427ea1ada11e8d9831 Removed session 3 -- basic data processing. diff -r 9d9e4026238f -r 7c5431fa2d46 day1/session3.tex --- a/day1/session3.tex Thu Dec 09 18:39:26 2010 +0530 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,583 +0,0 @@ -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%Tutorial slides on Python. -% -% Author: FOSSEE -% Copyright (c) 2009, FOSSEE, IIT Bombay -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - -\documentclass[14pt,compress]{beamer} -%\documentclass[draft]{beamer} -%\documentclass[compress,handout]{beamer} -%\usepackage{pgfpages} -%\pgfpagesuselayout{2 on 1}[a4paper,border shrink=5mm] - -% Modified from: generic-ornate-15min-45min.de.tex -\mode -{ - \usetheme{Warsaw} - \useoutertheme{infolines} - \setbeamercovered{transparent} -} - -\usepackage[english]{babel} -\usepackage[latin1]{inputenc} -%\usepackage{times} -\usepackage[T1]{fontenc} - -% Taken from Fernando's slides. -\usepackage{ae,aecompl} -\usepackage{mathpazo,courier,euler} -\usepackage[scaled=.95]{helvet} -\usepackage{amsmath} - -\definecolor{darkgreen}{rgb}{0,0.5,0} - -\usepackage{listings} -\lstset{language=Python, - basicstyle=\ttfamily\bfseries, - commentstyle=\color{red}\itshape, - stringstyle=\color{darkgreen}, - showstringspaces=false, - keywordstyle=\color{blue}\bfseries} - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -% Macros -\setbeamercolor{emphbar}{bg=blue!20, fg=black} -\newcommand{\emphbar}[1] -{\begin{beamercolorbox}[rounded=true]{emphbar} - {#1} - \end{beamercolorbox} -} -\newcounter{time} -\setcounter{time}{0} -\newcommand{\inctime}[1]{\addtocounter{time}{#1}{\tiny \thetime\ m}} - -\newcommand{\typ}[1]{\lstinline{#1}} - -\newcommand{\kwrd}[1]{ \texttt{\textbf{\color{blue}{#1}}} } - -%%% This is from Fernando's setup. -% \usepackage{color} -% \definecolor{orange}{cmyk}{0,0.4,0.8,0.2} -% % Use and configure listings package for nicely formatted code -% \usepackage{listings} -% \lstset{ -% language=Python, -% basicstyle=\small\ttfamily, -% commentstyle=\ttfamily\color{blue}, -% stringstyle=\ttfamily\color{orange}, -% showstringspaces=false, -% breaklines=true, -% postbreak = \space\dots -% } - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -% Title page -\title[Statistics]{Python for Science and Engg:\\ Basic data processing} - -\author[FOSSEE] {FOSSEE} - -\institute[IIT Bombay] {Department of Aerospace Engineering\\IIT Bombay} - -\date[] {SciPy 2010, Introductory tutorials,\\Day 1, Session 3} -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - -%\pgfdeclareimage[height=0.75cm]{iitmlogo}{iitmlogo} -%\logo{\pgfuseimage{iitmlogo}} - - -%% Delete this, if you do not want the table of contents to pop up at -%% the beginning of each subsection: -\AtBeginSubsection[] -{ - \begin{frame} - \frametitle{Outline} - \tableofcontents[currentsection,currentsubsection] - \end{frame} -} - -\AtBeginSection[] -{ - \begin{frame} - \frametitle{Outline} - \tableofcontents[currentsection,currentsubsection] - \end{frame} -} - -\newcommand{\num}{\texttt{numpy}} - - -% If you wish to uncover everything in a step-wise fashion, uncomment -% the following command: -%\beamerdefaultoverlayspecification{<+->} - -%\includeonlyframes{current,current1,current2,current3,current4,current5,current6} - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -% DOCUMENT STARTS -\begin{document} - -\begin{frame} - \maketitle -\end{frame} - -%% \begin{frame} -%% \frametitle{Outline} -%% \tableofcontents -%% % You might wish to add the option [pausesections] -%% \end{frame} - -\section{Computing the mean} -\begin{frame} - \frametitle{Value of acceleration due to gravity?} - \begin{itemize} - \item We already have \typ{pendulum.txt} - \item We know that $ T = 2\pi \sqrt{\frac{L}{g}} $ - \item So $ g = \frac{4 \pi^2 L}{T^2} $ - \item Calculate $g$ - acceleration due to gravity for each pair of - $L$ and $T$ - \item Hence calculate mean $g$ - \end{itemize} -\end{frame} - -\begin{frame}[fragile] - \frametitle{Acceleration due to gravity - $g$\ldots} - \begin{lstlisting} -In []: g_list = [] -In []: for line in open('pendulum.txt'): - .... point = line.split() - .... L = float(point[0]) - .... t = float(point[1]) - .... g = 4 * pi * pi * L / (t * t) - .... g_list.append(g) - \end{lstlisting} -\end{frame} - -\begin{frame}[fragile] - \frametitle{Mean $g$ - Classical method} - \begin{lstlisting} -In []: total = 0 -In []: for g in g_list: - ....: total += g - ....: - -In []: g_mean = total / len(g_list) -In []: print 'Mean: ', g_mean - \end{lstlisting} -\end{frame} - -\begin{frame}[fragile] - \frametitle{Mean $g$ - Slightly improved method} - \begin{lstlisting} -In []: g_mean = sum(g_list) / len(g_list) -In []: print 'Mean: ', g_mean - \end{lstlisting} -\end{frame} - -\begin{frame}[fragile] - \frametitle{Mean $g$ - One liner} - \begin{lstlisting} -In []: g_mean = mean(g_list) -In []: print 'Mean: ', g_mean - \end{lstlisting} - \inctime{10} -\end{frame} - -\section{Processing voluminous data} -\begin{frame} - \frametitle{More on data processing} - \begin{block}{} - We have a huge data file--180,000 records.\\How do we do - \emph{efficient} statistical computations, i.e. find mean, median, - standard deviation etc.;\\How do we draw pie charts? - \end{block} -\end{frame} - -\begin{frame} - \frametitle{Structure of the file} - Understanding the structure of \typ{sslc1.txt} - \begin{itemize} - \item Each line in the file has a student's details(record) - \item Each record consists of fields separated by ';' - \end{itemize} -\emphbar{A;015162;JENIL T P;081;060;77;41;74;333;P;;} -\end{frame} - -\begin{frame} - \frametitle{Structure of the file \ldots} -\emphbar{A;015163;JOSEPH RAJ S;083;042;47;AA;72;244;;;} - Each record consists of: - \begin{itemize} - \item Region Code - \item Roll Number - \item Name - \item Marks of 5 subjects: second lang, first lang., Math, Science, - Social Studies - \item Total marks - \item Pass/Fail (P/F) - \item Withheld (W) - \end{itemize} - \inctime{5} -\end{frame} - -\begin{frame} - \frametitle{Statistical Analysis: Problem statement} - 1. Read the data supplied in the file \typ{sslc1.txt} and carry out the following: - \begin{itemize} - \item[a] Draw a pie chart representing proportion of students who scored more than 90\% in each region in Science. - \item[b] Print mean, median and standard deviation of math scores for all regions combined. - \end{itemize} -\end{frame} - -\begin{frame} - \frametitle{Problem statement: explanation} - \emphbar{a. Draw a pie chart representing proportion of students who scored more than 90\% in each region in Science.} -\begin{columns} - \column{5.25\textwidth} - \hspace*{.5in} -\includegraphics[height=2.6in, interpolate=true]{data/science} - \column{0.8\textwidth} -\end{columns} -\end{frame} - -\begin{frame} - \frametitle{Machinery Required} - \begin{itemize} - \item File reading - \item Parsing - \item Dictionaries - \item Arrays - \item Statistical operations - \end{itemize} -\end{frame} - -\subsection{Data processing} -\begin{frame}[fragile] - \frametitle{File reading and parsing \ldots} -\emphbar{Reading files line by line is the same as we had done with the pendulum example.} - - \begin{lstlisting} -for record in open('sslc1.txt'): - fields = record.split(';') - \end{lstlisting} -\end{frame} - -\subsection{Dictionaries} -\begin{frame}[fragile] - \frametitle{Dictionaries: Introduction} - \begin{itemize} - \item Lists index using integers\\ -Recall \typ{p = [2, 3, 5, 7]} and\\ -\typ{p[1]} is equal to \typ{3} - \item Dictionaries index using strings - \end{itemize} -\end{frame} - -\begin{frame}[fragile] - \frametitle{Dictionaries \ldots} - \begin{lstlisting} -In []: d = {'png' : 'image file', - 'txt' : 'text file', - 'py' : 'python code', - 'java': 'bad code', - 'cpp': 'complex code'} - -In []: d['txt'] -Out[]: 'text file' - \end{lstlisting} -\end{frame} - -\begin{frame}[fragile] - \frametitle{Dictionaries \ldots} - \begin{lstlisting} -In []: 'py' in d -Out[]: True - -In []: 'jpg' in d -Out[]: False - \end{lstlisting} -\end{frame} - -\begin{frame}[fragile] - \frametitle{Dictionaries \ldots} - \begin{small} - \begin{lstlisting} -In []: d.keys() -Out[]: ['cpp', 'py', 'txt', 'java', 'png'] - -In []: d.values() -Out[]: ['complex code', 'python code', - 'text file', 'bad code', - 'image file'] - \end{lstlisting} - \end{small} - \inctime{10} -\end{frame} - -\begin{frame}[fragile] - \frametitle{Inserting elements into dictionary} - \emphbar{\alert{d[key] = value}} - \begin{lstlisting} - In []: d['bin'] = 'binary file' - In []: d - Out[]: - {'bin': 'binary file', - 'cpp': 'complex code', - 'java': 'bad code', - 'png': 'image file', - 'py': 'python code', - 'txt': 'text file'} - \end{lstlisting} -\end{frame} - -\begin{frame}[fragile] - \frametitle{Getting back to the problem} - Let our dictionary be: - \begin{lstlisting} -science = {} - \end{lstlisting} -\begin{itemize} - \item Keys will be region codes - \item Values will be the number students who scored more than 90\% in that region in Science - \end{itemize} - \begin{block}{Sample \typ{science} dictionary} - \{'A': 729, 'C': 764, 'B': 1120,'E': 414, 'D': 603, 'F': 500\} - \end{block} - -\end{frame} - -\begin{frame}[fragile] - \frametitle{Building parsed data \ldots} - \begin{lstlisting} -science = {} - -for record in open('sslc1.txt'): - fields = record.split(';') - - region_code = fields[0].strip() - \end{lstlisting} -\end{frame} - -\begin{frame}[fragile] - \frametitle{Building parsed data \ldots} - \begin{lstlisting} - if region_code not in science: - science[region_code] = 0 - - score_str = fields[6].strip() - - score = 0 - if score_str != 'AA': - score = int(score_str) - - if score > 90: - science[region_code] += 1 - \end{lstlisting} -\end{frame} - -\begin{frame}[fragile] - \frametitle{Building parsed data \ldots} - \begin{lstlisting} -print science -print science.keys() -print science.values() - \end{lstlisting} -\end{frame} - -\subsection{Visualizing data} -\begin{frame}[fragile] - \frametitle{Pie Chart} - \begin{lstlisting} - pie(science.values()) - \end{lstlisting} -\includegraphics[height=2in, interpolate=true]{data/science_nolabel} -\end{frame} - -\begin{frame}[fragile] - \frametitle{Pie chart} - \small - \begin{lstlisting} -pie(science.values(), - labels = science.keys()) -title('Students scoring 90% and above - in science by region') -savefig('science.png') - \end{lstlisting} -\begin{columns} - \column{5.25\textwidth} - \hspace*{1.1in} -\includegraphics[height=2in, interpolate=true]{data/science} - \column{0.8\textwidth} -\end{columns} - \inctime{10} -\end{frame} - -\begin{frame} - \frametitle{Problem statement} - \emphbar{b. Print mean, median and standard deviation of math scores for all regions combined.} -\end{frame} - -\begin{frame}[fragile] - \frametitle{Building data for statistics} - \begin{lstlisting} -math_scores = [] - -for record in open('sslc1.txt'): - fields = record.split(';') - - score_str = fields[5].strip() - score = 0 - if score_str != 'AA': - score = int(score_str) - - math_scores.append(score) - \end{lstlisting} -\end{frame} - -\subsection{Obtaining statistics} -\begin{frame}[fragile] - \frametitle{Obtaining statistics} - \begin{lstlisting} -print 'Mean: ', mean(math_scores) - -print 'Median: ', median(math_scores) - -print 'Standard Deviation: ', - std(math_scores) - \end{lstlisting} - \inctime{10} -\end{frame} - -\begin{frame}[fragile] - \frametitle{Obtaining statistics: efficiently!} - \begin{lstlisting} -math_array = array(math_scores) - -print 'Mean: ', mean(math_array) - -print 'Median: ', median(math_array) - -print 'Standard Deviation: ', - std(math_array) - \end{lstlisting} - \inctime{5} -\end{frame} - -\begin{frame}[fragile] - \frametitle{IPython tip: Timing} - -Try the following: - \begin{lstlisting} -In []: %timeit mean(math_scores) - -In []: %timeit mean(math_array) - -In []: %timeit? - - \end{lstlisting} - - \begin{itemize} - \item \typ{\%timeit}: accurate, many measurements - \item Can also use \typ{\%time} - \item \typ{\%time}: less accurate, one measurement - \end{itemize} - - \inctime{5} -\end{frame} - -\begin{frame}[fragile] - \frametitle{What tools did we use?} - \begin{itemize} - \item More parsing data - \item Dictionaries for storing data - \item Facilities for drawing pie charts - \item Functions for statistical computations - mean, median, standard deviation - \item Efficient array manipulations - \item Timing in IPython - \end{itemize} - -\end{frame} - -\end{document} - -%% Questions for Quiz %% -%% ------------------ %% - -\begin{frame} -\frametitle{\incqno } - A sample line from a Comma Separated Values (CSV) file:\\ - \vspace*{0.2in} - \emph{Rossum, Guido, 42, 56, 34, 54}\\ - \vspace*{0.2in} - What code would you use to separate the line into fields? -\end{frame} - -\begin{frame}[fragile] -\frametitle{\incqno } - \begin{lstlisting} - In []: a = [1, 2, 5, 9] - \end{lstlisting} - How do you find the length of this list? -\end{frame} - -\begin{frame}[fragile] -\frametitle{\incqno } - \begin{lstlisting} - In [1]: d = { - 'a': 1, - 'b': 2 - } - In [2]: print d['c'] - \end{lstlisting} - What is the output? -\end{frame} - -\begin{frame}[fragile] -\frametitle{\incqno } -\begin{lstlisting} -In []: sc = {'A': 10, 'B': 20, - 'C': 70} -\end{lstlisting} -Given the above dictionary, what command will you give to plot a -pie-chart? -\end{frame} - -\begin{frame}[fragile] -\frametitle{\incqno } -\begin{lstlisting} -In []: marks = [10, 20, 30, 50, 55, - 75, 83] -\end{lstlisting} -Given the above marks, how will you calculate the \alert{mean} and -\alert{standard deviation}? -\end{frame} - -\begin{frame}[fragile] -\frametitle{\incqno } -\begin{lstlisting} -In []: marks = [10, 20, 30, 50, 55, - 75, 83] -\end{lstlisting} -How will you convert the list \texttt{marks} to an \alert{array}? -\end{frame} - -%% \begin{frame}[fragile] -%% \frametitle{\incqno } -%% \begin{lstlisting} -%% for x in "abcd": -%% print x - -%% a -%% b -%% c -%% d -%% \end{lstlisting} -%% How do you get the following output? -%% \begin{lstlisting} -%% 0 a -%% 1 b -%% 2 c -%% 3 d -%% \end{lstlisting} -%% \end{frame} -