In [32]:
import pandas as pd

df = pd.read_csv("data/sampledata.csv",sep=',',skiprows=3, header=0, index_col=0)
print(df)
       Country      2006      2007      2008      2009      2010      2011  \
Code                                                                         
ESP      Spain  44397319  45226803  45954106  46362946  46576897  46742697   
FRA     France  63621376  64016229  64374990  64707044  65027512  65342776   
GRC     Greece  11020362  11048473  11077841  11107017  11121341  11104899   
IRL    Ireland   4273591   4398942   4489544   4535375   4560155   4576794   
ITA      Italy  58143979  58438310  58826731  59095365  59277417  59379449   
MLT      Malta    405308    406724    409379    412477    414508    416268   
PRT   Portugal  10522288  10542964  10558177  10568247  10573100  10557560   
CYP     Cyprus   1048293   1063040   1077010   1090486   1103685   1116644   

          2012      2013      2014      2015  
Code                                          
ESP   46773055  46620045  46480882  46418269  
FRA   65659790  65972097  66495940  66808385  
GRC   11045011  10965211  10892413  10823732  
IRL    4586897   4598294   4617225   4640703  
ITA   59539717  60233948  60789140  60802085  
MLT     419455    423374    427364    431333  
PRT   10514844  10457295  10401062  10348648  
CYP    1129303   1141652   1153658   1165300  
In [29]:
## 1 ========= Accessing by Columns =======================
## 1.1 Get the first two columns 
# cn = df[[0,1]]
# cn

## 1.2 Get the last two columns 
# x = df.columns.size
# cn = df[[x-2,x-1]]
# cn
## alternatively: 
# cn = df[[len(df.columns)-2,len(df.columns)-1]]
# cn

## 1.3 Get one column by name (write three ways that this can be accomplished)
## Explore the type of the extracted part in each case  
# cn1 = df.Country
# cn2 = df['2007']
# cn3 = df[['2010']]
# type(cn3)

## 1.4 Get more than one columns by name 
## What is the type of the extracted data set?
# cn1 = df[['2006', '2008']]

## 1.5 Get three or more columns by position 
#Try to slice columns - What happens? 
# cn = df[[1,3,5]]
# cnsl = df[[1:3]]   #does not work
# cnsl = df[1:3]     #this works for rows

## 1.6 Try to slice columns by writing a list comprehension for selecting columns
# What happens?
# cn = df[[i for i in range(0,10,3)]]
# cn


# 2 ==== .loc===========================
## 2.1 Get onw row by label (write two ways to do it and explore the type of the new object)
# rs = df.loc['FRA']
# type(rs)

## 2.2 Get two rows by label and explore the type of the new object 
# rs = df.loc[['FRA', 'ITA']]
# rs

## 2.3 Slice the dataframe by rows (for example from 'FRA to 'ITA')
# rs = df.loc['FRA':'ITA']
# rs

## 2.4 Slice two rows by name and slice two columns also by name
#rs = df.loc['ESP':'FRA','Country':'2006']

## 2.5 Then produce the same outcome (two rows and two columns) without slicing
# rs = df.loc[['ESP','FRA']][['Country','2006']]
# rs


## 3 ==== .iloc===========================
## 3.1 Get the odd indexed columns(1,3,5..)
# cn = df.iloc[:,1::2]
# cn

## 3.2 Slice the odd position rows and the even position columns
# rs = df.iloc[1::2,0::2]
# rs

#===== .ix ===============================

## 3.3 Slice rows by label (for example odd indexed labels) and columns by position (even) 
# rs = df.ix['ESP'::2,0::2]
# rs

## 4 ===== [] ===============================
## 4.1 Access a single item without any [] 
# it = df.Country.ESP
# it

## 4.2 Access a single item with []
# it = df['Country']['GRC']
# it


## 5 ===== .at ===============================
## 5.1 Access a single item with row - column labels
# it = df.at['FRA','2006']
# it


## 6 ===== .iat ===============================
## 6.1 Access a single item with row - column indexes
# it = df.iat[2,3]
# it

## 6.2 Use len() to get the number of columns
#Then get a single item from the last column using row - column indexes
# cnum = len(df.columns)
# it = df.iat[3,cnum-1]
# it

## 6.3 Use len() to get the number of columns
# Use again len() to get the number of rows
# Then get the item in the intersection of last row/column using the previous values as indexes
# it = df.iat[len(df)-1,len(df.columns)-1]
# it


## 7 ======== Combining first-column then-row indexers====================
## 7.1 In column 'Country' get the value in the first row
# it = df.Country.iloc[0]
# it

## 7.2 In column '2010' get the value in the last row
# it = df['2010'].iloc[len(df)-1]
# it

## 7.3 In the last column get the item for the row 'ESP'
# it = df[[len(df.columns)-1]].loc['ESP']
# it


## 8 ======== Get the index when a value is known====================
## 8.1 Which are the indexes of the df DataFrame?
# df.index

## 8.2 Which are the indexes for column '2010'?
# df[['2010']].index

## 8.3 Which is the index for the Country with name 'Greece'?
#df.Country[df.Country=='Greece'].index.tolist()

## 8.4 Which country has population 4586897 in the year '2012'?
#df[df['2012']==4586897].index.tolist()

## 8.5 Which country has population 4586897 and in which year?
#Hint: Iterate over columns and rows to locate value 4586897
# for x in df.columns:
#     for y in df.index:
#         if df.loc[y,x]==4586897:
#             print(x,y)
#             break


## 9 ====== Applying Functions=======================
## 9.1 Which country has the highest population in 2006?
#Get the '2006' column (which is a Series object) 
# cn = df['2006']
# #Then apply argmax() to get the index
# cnmaxind = cn.argmax()
# print(cnmaxind)

## 9.2 In which year did Spain had the highest population?
#Get the 'ESP' row
# ro = df.loc['ESP']
#Then apply argmax() - Slicing from 1 and above to exclude the string value 'Spain'
# romaxind = ro[1:].argmax()
# romaxind


## 9.3 Compute the mean for all rows and all columns using the 'axis' property 
# explore the type of the new object constructed
# sm = df.mean(axis=0)
# sm


## 9.4 Apply the scipy.stats describe() function in various rows to get some descriptive statistics 
# import scipy.stats as st
# ro = df.loc['ESP']
# st.describe(ro.values[1:])
In [ ]:
#====================================================
#==== MODIFY Columns ================================

## 1) RENAME: 
## 1.1 Rename column 'Country' to 'Country Name'
# dfnew = df.rename(columns={'Country': 'Country name'})
# dfnew

## 2) ADD: 
## 2.1 Add a new column at the end with the average population for each country. 
## Name this column as 'Average' (use the previously constructed new dataframe)

# First construct a Series with the average values of each row 
# avg = df.mean(axis=1)
# # Then add the 'avg' Series to the 'df' DataFrame by assignment 
# dfnew['Average'] = avg
# dfnew


## 3) DELETE: 
## 3.1 Delete first column with label "Country name"
# del dfnew['Country name']
# dfnew


## 4) INSERT: 
## 4.1 Construct a new list (or Series with same labels) having in each position the highest value
## of population for the country. Insert this object as the FIRST column n the dataframe. 
## Label the column as 'Max Popu'
# smax = df.max(axis=1)
# dfnew.insert(0,'Max Popu',smax)
# dfnew

## 5) REARRANGE: 
## 5.1 Move the first column ('Max Val') so that it becomes the last in the DataFrame
# clist = dfnew.columns
# clist = clist[1:]|clist[0:1]
# dfnew = dfnew[clist]
# dfnew

## 6) REPLACE: 
## 6.1 Increase the values in '2006' column by 10% 
# dfnew[['2006']]=1.1*dfnew[['2006']]
# dfnew
In [52]:
#====================================================
#==== MODIFY Rows ===================================

## 1) ADD & CONCATENATE ===================================
# dfadd = df[['2014','2015']]                                 # Getting two columns from df 
# dfadd = dfadd.rename(columns={'2014':'2016','2015':'2017'}) # Renaming to 2016 and 2017 
# dfadd.loc['ALB']=[2780000,2900000]                          # Adding a new Row with .loc indexer 

# # Get the first two columns of dfnew for practical purposes
# dfnewp = df[[0,1]]
# print(dfnewp)
# print(dfadd)

# #Concatenating dfnew and dfadd
# pd.concat([dfnewp,dfadd], axis=1)
In [ ]:
#====================================================
#==== GROUPBY ===================================

## preparation of dataframe
# data = pd.read_excel('data/researchdata.xlsx', sheetname="groupby")
# data.head(5)

## Grouping @ Describing --> 4 groups
# mygroups = data.groupby(['Gender','Level'])
# mygroups.describe()

## iterating over 4 groups 
# for name, grp in mygroups:
#      print(name)
#      print(grp)

## Getting the two boys groups 
# boysK6 = mygroups.get_group(('b','K6'))
# boysK9 = mygroups.get_group(('b','K9'))

## Applying aggregate method to get mean and std over all groups  
# mystats = mygroups.agg([np.mean,np.std])
# print(mystats)

## Accessing a specific stat measure (std) through multi-indexing both in Columns and in Rows
# mystats['Performance','std'].loc['b'].loc['K6']