import pandas as pd
df = pd.read_csv("data/sampledata.csv",sep=',',skiprows=3, header=0, index_col=0)
print(df)
## 1 ========= Accessing by Columns =======================
## 1.1 Get the first two columns
# cn = df[[0,1]]
# cn
## 1.2 Get the last two columns
# x = df.columns.size
# cn = df[[x-2,x-1]]
# cn
## alternatively:
# cn = df[[len(df.columns)-2,len(df.columns)-1]]
# cn
## 1.3 Get one column by name (write three ways that this can be accomplished)
## Explore the type of the extracted part in each case
# cn1 = df.Country
# cn2 = df['2007']
# cn3 = df[['2010']]
# type(cn3)
## 1.4 Get more than one columns by name
## What is the type of the extracted data set?
# cn1 = df[['2006', '2008']]
## 1.5 Get three or more columns by position
#Try to slice columns - What happens?
# cn = df[[1,3,5]]
# cnsl = df[[1:3]] #does not work
# cnsl = df[1:3] #this works for rows
## 1.6 Try to slice columns by writing a list comprehension for selecting columns
# What happens?
# cn = df[[i for i in range(0,10,3)]]
# cn
# 2 ==== .loc===========================
## 2.1 Get onw row by label (write two ways to do it and explore the type of the new object)
# rs = df.loc['FRA']
# type(rs)
## 2.2 Get two rows by label and explore the type of the new object
# rs = df.loc[['FRA', 'ITA']]
# rs
## 2.3 Slice the dataframe by rows (for example from 'FRA to 'ITA')
# rs = df.loc['FRA':'ITA']
# rs
## 2.4 Slice two rows by name and slice two columns also by name
#rs = df.loc['ESP':'FRA','Country':'2006']
## 2.5 Then produce the same outcome (two rows and two columns) without slicing
# rs = df.loc[['ESP','FRA']][['Country','2006']]
# rs
## 3 ==== .iloc===========================
## 3.1 Get the odd indexed columns(1,3,5..)
# cn = df.iloc[:,1::2]
# cn
## 3.2 Slice the odd position rows and the even position columns
# rs = df.iloc[1::2,0::2]
# rs
#===== .ix ===============================
## 3.3 Slice rows by label (for example odd indexed labels) and columns by position (even)
# rs = df.ix['ESP'::2,0::2]
# rs
## 4 ===== [] ===============================
## 4.1 Access a single item without any []
# it = df.Country.ESP
# it
## 4.2 Access a single item with []
# it = df['Country']['GRC']
# it
## 5 ===== .at ===============================
## 5.1 Access a single item with row - column labels
# it = df.at['FRA','2006']
# it
## 6 ===== .iat ===============================
## 6.1 Access a single item with row - column indexes
# it = df.iat[2,3]
# it
## 6.2 Use len() to get the number of columns
#Then get a single item from the last column using row - column indexes
# cnum = len(df.columns)
# it = df.iat[3,cnum-1]
# it
## 6.3 Use len() to get the number of columns
# Use again len() to get the number of rows
# Then get the item in the intersection of last row/column using the previous values as indexes
# it = df.iat[len(df)-1,len(df.columns)-1]
# it
## 7 ======== Combining first-column then-row indexers====================
## 7.1 In column 'Country' get the value in the first row
# it = df.Country.iloc[0]
# it
## 7.2 In column '2010' get the value in the last row
# it = df['2010'].iloc[len(df)-1]
# it
## 7.3 In the last column get the item for the row 'ESP'
# it = df[[len(df.columns)-1]].loc['ESP']
# it
## 8 ======== Get the index when a value is known====================
## 8.1 Which are the indexes of the df DataFrame?
# df.index
## 8.2 Which are the indexes for column '2010'?
# df[['2010']].index
## 8.3 Which is the index for the Country with name 'Greece'?
#df.Country[df.Country=='Greece'].index.tolist()
## 8.4 Which country has population 4586897 in the year '2012'?
#df[df['2012']==4586897].index.tolist()
## 8.5 Which country has population 4586897 and in which year?
#Hint: Iterate over columns and rows to locate value 4586897
# for x in df.columns:
# for y in df.index:
# if df.loc[y,x]==4586897:
# print(x,y)
# break
## 9 ====== Applying Functions=======================
## 9.1 Which country has the highest population in 2006?
#Get the '2006' column (which is a Series object)
# cn = df['2006']
# #Then apply argmax() to get the index
# cnmaxind = cn.argmax()
# print(cnmaxind)
## 9.2 In which year did Spain had the highest population?
#Get the 'ESP' row
# ro = df.loc['ESP']
#Then apply argmax() - Slicing from 1 and above to exclude the string value 'Spain'
# romaxind = ro[1:].argmax()
# romaxind
## 9.3 Compute the mean for all rows and all columns using the 'axis' property
# explore the type of the new object constructed
# sm = df.mean(axis=0)
# sm
## 9.4 Apply the scipy.stats describe() function in various rows to get some descriptive statistics
# import scipy.stats as st
# ro = df.loc['ESP']
# st.describe(ro.values[1:])
#====================================================
#==== MODIFY Columns ================================
## 1) RENAME:
## 1.1 Rename column 'Country' to 'Country Name'
# dfnew = df.rename(columns={'Country': 'Country name'})
# dfnew
## 2) ADD:
## 2.1 Add a new column at the end with the average population for each country.
## Name this column as 'Average' (use the previously constructed new dataframe)
# First construct a Series with the average values of each row
# avg = df.mean(axis=1)
# # Then add the 'avg' Series to the 'df' DataFrame by assignment
# dfnew['Average'] = avg
# dfnew
## 3) DELETE:
## 3.1 Delete first column with label "Country name"
# del dfnew['Country name']
# dfnew
## 4) INSERT:
## 4.1 Construct a new list (or Series with same labels) having in each position the highest value
## of population for the country. Insert this object as the FIRST column n the dataframe.
## Label the column as 'Max Popu'
# smax = df.max(axis=1)
# dfnew.insert(0,'Max Popu',smax)
# dfnew
## 5) REARRANGE:
## 5.1 Move the first column ('Max Val') so that it becomes the last in the DataFrame
# clist = dfnew.columns
# clist = clist[1:]|clist[0:1]
# dfnew = dfnew[clist]
# dfnew
## 6) REPLACE:
## 6.1 Increase the values in '2006' column by 10%
# dfnew[['2006']]=1.1*dfnew[['2006']]
# dfnew
#====================================================
#==== MODIFY Rows ===================================
## 1) ADD & CONCATENATE ===================================
# dfadd = df[['2014','2015']] # Getting two columns from df
# dfadd = dfadd.rename(columns={'2014':'2016','2015':'2017'}) # Renaming to 2016 and 2017
# dfadd.loc['ALB']=[2780000,2900000] # Adding a new Row with .loc indexer
# # Get the first two columns of dfnew for practical purposes
# dfnewp = df[[0,1]]
# print(dfnewp)
# print(dfadd)
# #Concatenating dfnew and dfadd
# pd.concat([dfnewp,dfadd], axis=1)
#====================================================
#==== GROUPBY ===================================
## preparation of dataframe
# data = pd.read_excel('data/researchdata.xlsx', sheetname="groupby")
# data.head(5)
## Grouping @ Describing --> 4 groups
# mygroups = data.groupby(['Gender','Level'])
# mygroups.describe()
## iterating over 4 groups
# for name, grp in mygroups:
# print(name)
# print(grp)
## Getting the two boys groups
# boysK6 = mygroups.get_group(('b','K6'))
# boysK9 = mygroups.get_group(('b','K9'))
## Applying aggregate method to get mean and std over all groups
# mystats = mygroups.agg([np.mean,np.std])
# print(mystats)
## Accessing a specific stat measure (std) through multi-indexing both in Columns and in Rows
# mystats['Performance','std'].loc['b'].loc['K6']