Added script for sslc.txt file and presentation.
authorShantanu <shantanu@fossee.in>
Mon, 12 Apr 2010 20:46:40 +0530
changeset 46 34df59770550
parent 45 9d61db7bf2f4
child 47 501e3fb21e3c
child 48 c0a48af139d2
Added script for sslc.txt file and presentation.
presentations/statistics.tex
statistics-script
statistics.txt
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/presentations/statistics.tex	Mon Apr 12 20:46:40 2010 +0530
@@ -0,0 +1,168 @@
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%Tutorial slides on Python.
+%
+% Author: FOSSEE 
+% Copyright (c) 2009, FOSSEE, IIT Bombay
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+\documentclass[14pt,compress]{beamer}
+%\documentclass[draft]{beamer}
+%\documentclass[compress,handout]{beamer}
+%\usepackage{pgfpages} 
+%\pgfpagesuselayout{2 on 1}[a4paper,border shrink=5mm]
+
+% Modified from: generic-ornate-15min-45min.de.tex
+\mode<presentation>
+{
+  \usetheme{Warsaw}
+  \useoutertheme{infolines}
+  \setbeamercovered{transparent}
+}
+
+\usepackage[english]{babel}
+\usepackage[latin1]{inputenc}
+%\usepackage{times}
+\usepackage[T1]{fontenc}
+
+% Taken from Fernando's slides.
+\usepackage{ae,aecompl}
+\usepackage{mathpazo,courier,euler}
+\usepackage[scaled=.95]{helvet}
+
+\definecolor{darkgreen}{rgb}{0,0.5,0}
+
+\usepackage{listings}
+\lstset{language=Python,
+    basicstyle=\ttfamily\bfseries,
+    commentstyle=\color{red}\itshape,
+  stringstyle=\color{darkgreen},
+  showstringspaces=false,
+  keywordstyle=\color{blue}\bfseries}
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+% Macros
+\setbeamercolor{emphbar}{bg=blue!20, fg=black}
+\newcommand{\emphbar}[1]
+{\begin{beamercolorbox}[rounded=true]{emphbar} 
+      {#1}
+ \end{beamercolorbox}
+}
+\newcounter{time}
+\setcounter{time}{0}
+\newcommand{\inctime}[1]{\addtocounter{time}{#1}{\tiny \thetime\ m}}
+
+\newcommand{\typ}[1]{\lstinline{#1}}
+
+\newcommand{\kwrd}[1]{ \texttt{\textbf{\color{blue}{#1}}}  }
+
+% Title page
+\title{Python for Scientific Computing : Large Scale Data Processing}
+
+\author[FOSSEE] {FOSSEE}
+
+\institute[IIT Bombay] {Department of Aerospace Engineering\\IIT Bombay}
+\date{}
+
+% DOCUMENT STARTS
+\begin{document}
+
+\begin{frame}
+  \maketitle
+\end{frame}
+
+\begin{frame}
+  \frametitle{About the Session}
+  \begin{block}{Goal}
+    We read and process a large data file to solve a problem.
+  \end{block}
+  \begin{block}{Checklist}
+    \begin{itemize}
+    \item sslc1.txt
+  \end{itemize}
+  \end{block}
+\end{frame}
+
+\begin{frame}
+  \frametitle{Structure of the file}
+  Understanding the structure of sslc1.txt
+  \begin{itemize}
+    \item Each line in the file has a student's details (a record)
+    \item Each record consists of fields separated by ';'
+  \end{itemize}
+\emphbar{A;015162;JENIL T P;081;060;77;41;74;333;P;;}
+\end{frame}
+
+\begin{frame}
+  \frametitle{Structure of the file \ldots}
+\emphbar{A;015163;JOSEPH RAJ S;083;042;47;AA;72;244;;;}
+  Each record consists of:
+  \begin{itemize}
+    \item Region Code : 'A'
+    \item Roll Number : '015163'
+    \item Name : 'JOSEPH RAJ S'
+    \item Marks of 5 subjects: English(083), Hindi(042), Maths(47), Science(AA), Social(72)
+    \item Total marks : 244
+    \item Pass/Fail (P/F) : ''
+    \item Withheld (W) : ''
+  \end{itemize}
+\end{frame}
+
+\begin{frame}
+  \frametitle{Statistical Analysis: Problem statement}
+  1. Read the data supplied in the file \emph{sslc1.txt} and carry out the following:
+  \begin{block}{}
+    Draw a pie chart representing proportion of students who scored more than 90\% in each region in Science.
+  \end{block} % closes the untitled block opened above; no itemize is open in this frame
+\end{frame}
+
+\begin{frame}
+  \frametitle{Problem statement: explanation}
+    \emphbar{Draw a pie chart representing proportion of students who scored more than 90\% in each region in Science.}
+    \begin{columns}
+    \column{5.25\textwidth}
+    \hspace*{.5in}
+    \includegraphics[height=2.6in, interpolate=true]{data/science}
+    \column{0.8\textwidth}
+\end{columns}
+\end{frame}
+
+\begin{frame}
+  \frametitle{Machinery Required}
+  \begin{itemize}
+    \item File reading
+    \item Parsing
+    \item Dictionaries 
+    \item Arrays
+    \item Statistical operations
+  \end{itemize}
+\end{frame}
+
+\begin{frame}[fragile]
+  \frametitle{Summary}
+  \begin{block}{lists}
+    \begin{itemize}
+    \item Creation.
+    \item Appending.
+    \item Iterating through list.
+    \end{itemize}
+  \end{block}
+  \begin{block}{Data processing}
+    \begin{itemize}
+    \item In form of lists.
+    \item Handling files.
+    \item for loops  
+    \end{itemize}  
+  \end{block}
+\end{frame}
+
+\begin{frame}
+  \frametitle{Thank you!}  
+  \begin{block}{}
+  This session is part of \textcolor{blue}{FOSSEE} project funded by:
+  \begin{center}
+    \textcolor{blue}{NME through ICT from MHRD, Govt. of India}.
+  \end{center}  
+  \end{block}
+\end{frame}
+
+\end{document}
--- a/statistics-script	Sun Apr 11 03:07:55 2010 +0530
+++ b/statistics-script	Mon Apr 12 20:46:40 2010 +0530
@@ -1,5 +1,9 @@
 Hello friends and welcome to the third tutorial in the series of tutorials on "Python for scientific computing."
 
+This session is a continuation of the tutorial on Plotting Experimental data.
+
+We shall look at plotting experimental data using slightly more advanced methods here, and then look into some statistical operations.
+
 In the previous tutorial we learnt how to read data from a file and plot it.
 We used 'for' loops and lists to get data in the desired format.
 IPython -Pylab also provides a function called 'loadtxt' that can get us the same data in the desired format without much hustle.
@@ -9,32 +13,35 @@
 
 l, t = loadtxt('pendulum.txt', unpack=True)
 
-(unpack = True) will give us all of first column(length) in l and second column(time) in t
+(unpack = True) will give us all the data in the first column which is the length in l and all the data in the second column which is the time period in t. Here both l and t are arrays. We shall look into what arrays are in subsequent tutorials.
 
 to know more about loadtxt type 
 
 loadtxt?
 This is a really powerful tool to load data directly from files which are well structured and formatted. It supports many features like getting selected columns only, or skipping rows. 
 
-Getting back to the problem, now to get squared values of t we can simply do
+Let's get back to the problem; hit q to exit. Now to get squared values of t we can simply do
 
 tsq = t*t
 
-Note we dont have to use the 'for' loop anymore. This is the benefit of arrays. If we try to do the something similar using lists we won't be able to escape the 'for' loop.
+Note that we don't have to use the 'for' loop anymore. This is the benefit of arrays. If we try to do something similar using lists we won't be able to escape the use of the 'for' loop.
 
 Let's now plot l vs tsq just as we did in the previous session
 
 plot(l, tsq, 'o')
 
-The basic equation for finding Time period of simple pendulum is:
+Let's continue with the pendulum experiment to obtain the value of the acceleration due to gravity. The basic equation for finding the time period of a simple pendulum is:
 
 T = 2*pi*sqrt(L/g)
 
+Rearranging this equation, we obtain the value of g as
+g = 4 pi squared into l by t squared.
+
 In this case we have the values of t and l already, so to find g value for each element we can simply use:
 
 g = 4*pi^2*L/T^2
 
-g is array with 90 elements, we can take the average of all these values to get the acceleration due to gravity('g') by
+g here is an array; we can take the average of all these values to get the acceleration due to gravity('g') by
 
 print mean(g)
 
--- a/statistics.txt	Sun Apr 11 03:07:55 2010 +0530
+++ b/statistics.txt	Mon Apr 12 20:46:40 2010 +0530
@@ -1,52 +1,61 @@
 Hello welcome to the tutorial on statistics and dictionaries in Python.
 
-In the previous tutorial we saw the `for' loop and lists. Here we shall look into
-calculating mean for the same pendulum experiment and then move on to calculate
-the mean, median and standard deviation for a very large data set.
-
-Let's start with calculating the mean acceleration due to gravity based on the data from pendulum.txt.
-
-We first create an empty list `g_list' to which we shall append the values of `g'.
-In []: g_list = []
+Till now we have covered:
+* How to create plots.
+* How to read data from file and process it.
 
-For each pair of `L' and `t' values in the file `pendulum.txt' we calculate the 
-value of `g' and append it to the list `g_list'
-In []: for line in open('pendulum.txt'):
-  ....     point = line.split()
-  ....     L = float(point[0])
-  ....     t = float(point[1])
-  ....     g = 4 * pi * pi * L / (t * t)
-  ....     g_list.append(g)
+In this session, we will use them and some new concepts to solve a problem/exercise. 
 
-We proceed to calculate the mean of the value of `g' from the list `g_list'. 
-Here we shall show three ways of calculating the mean. 
-Firstly, we calculate the sum `total' of the values in `g_list'.
-In []: total = 0
-In []: for g in g_list:
- ....:     total += g
- ....:
-
-Once we have the total we calculate by dividing the `total' by the length of `g_list'
+We have a file named sslc1.txt.
+It contains the records of students and their performance in one of the State Secondary Board Examinations.
+We can see the contents of the file by opening it with any text editor.
+Please don't edit the data.
+It is arranged in a particular format.
+One particular line being:
+A;015163;JOSEPH RAJ S;083;042;47;AA;72;244;;;
+It has following fields:
+* Region Code which is 'A'
+* Roll Number 015163
+* Name JOSEPH RAJ S
+* Marks of 5 subjects: 
+  ** English 083
+  ** Hindi 042
+  ** Maths 47
+  ** Science AA (Absent)
+  ** Social 72
+* Total marks 244
+* Pass/Fail: blank because he was absent in one exam; otherwise it would be P or F
+* Withheld: blank in this case (otherwise W)
 
-In []: g_mean = total / len(g_list)
-In []: print 'Mean: ', g_mean
+So the problem we are going to solve is:
+Draw a pie chart representing proportion of students who scored more than 90% in each region in Science.
+
+The result would be something like this:
+slide of result.
 
-The second method is slightly simpler. Python provides a built-in function called "sum()" that computes the sum of all the elements in a list. 
-In []: g_mean = sum(g_list) / len(g_list)
-In []: print 'Mean: ', g_mean
+We would be using following machinery:
+File Reading(done already)
+parsing (done partly)
+Dictionaries (new)
+Arrays
+Plot (done already)
 
-The third method is the simplest. Python provides a built-in function `mean' that
-calculates the mean of all the elements in a list.
-In []: g_mean = mean(g_list)
-In []: print 'Mean: ', g_mean
+Dictionaries
 
-Python provides support for dictionaries. Dictionaries are key value pairs. Lists are indexed by integers while dictionaries are indexed by strings. For example:
+We earlier used lists, we just created them and appended items to list. 
+x = [1, 4, 2, 7, 6]
+to access the first element we use index number, and it starts from 0 so
+x[0] will give
+1 and
+x[3] will give
+7
+
+At times we don't have an index to relate things. For example, consider a telephone directory: we give it a name and it should return the corresponding number. A list is not the best kind of data structure for such problems, and hence Python provides support for dictionaries. Dictionaries are key value pairs. Lists are indexed by integers while dictionaries are indexed by strings. For example:
+
 In []: d = {'png' : 'image',
       'txt' : 'text', 
       'py' : 'python'} 
-is a dictionary. The first element in the pair is called the `key' and the second 
-is called the `value'. The key always has to be a string while the value can be 
-of any type.
+d is a dictionary. The first element in the pair is called the `key' and the second is called the `value'. The key always has to be a string while the value can be of any type.
 
 Dictionaries are indexed using their keys as shown
 In []: d['txt']
@@ -56,22 +65,86 @@
 Out[]: 'image'
 
 The dictionaries can be searched for the presence of a certain key by typing
-In []: 'py' in d
-Out[]: True
+'py' in d
+True
 
-In []: 'jpg' in d
-Out[]: False
+'jpg' in d
+False
 Please note the values cannot be searched in a dictionaries.
 
-In []: d.keys()
-Out[]: ['py', 'txt', 'png']
+d.keys()
+['py', 'txt', 'png']
 is used to obtain the list of all keys in a dictionary
 
-In []: d.values()
-Out[]: ['python', 'text', 'image']
+d.values()
+['python', 'text', 'image']
 is used to obtain the list of all values in a dictionary
 
-In []: d
-Out[]: {'png': 'image', 'py': 'python', 'txt': 'text'}
-Please observe that dictionaries do not preserve the order in which the items
-were entered. The order of the elements in a dictionary should not be relied upon.
+d
+
+Please observe that dictionaries do not preserve the order in which the items were entered. The order of the elements in a dictionary should not be relied upon.
+
+------------------------------------------------------------------------------------------------------------------
+
+Parsing and string processing
+
+As we saw previously we will be dealing with lines with such content
+A;015162;JENIL T P;081;060;77;41;74;333;P;;
+so ';' is delimiter we have to look for.
+We will create one string variable to see how we can process it to get the desired output.
+
+line = 'A;015162;JENIL T P;081;060;77;41;74;333;P;;'
+a = line.split(';')
+we have used split earlier to split on empty spaces.
+a 
+
+is a list with all elements separated.
+a[0] is the region we want.
+and a[6] will give us the science marks of that particular student.
+So we create a dictionary of all the regions with the number of students having more than 90 marks.
+Something like 
+d = {'A': 729, 'C': 764, 'B': 1120,'E': 414, 'D': 603, 'F': 500}
+
+------------------------------------------------------------------------------------------------------------------
+
+code
+
+We first create an empty dictionary
+
+science = {}
+now we read the record data one by one
+
+for record in open('sslc1.txt'):
+
+    we split the record on ';' and store the list in 'fields'
+    fields = record.split(';')
+
+    now we strip this string for leading and trailing white spaces
+    region_code = fields[0].strip()
+
+    now we check if the region code is already there in the dictionary by writing an 'if' statement
+    if region_code not in science:    
+       when this statement is true, we add new entry to dictionary with 
+       science[region_code] = 0
+
+    we again strip(ing is good) the string
+    score_str = fields[6].strip()
+
+    we check if student was not absent
+    if score_str != 'AA':
+       then we check if his marks are above 90 or not
+       if int(score_str) > 90:
+       	  science[region_code] += 1
+
+    Hit return twice
+
+by end of this loop we will have our desired output in the dictionary 'science'
+we can check the values by
+science
+
+now to create a pie chart we use
+
+pie(science.values(),labels = science.keys())
+title('Students scoring 90% and above in science by region')
+savefig('science.png')
+