# --- Series -----------------
# Walk-through of the basic pandas Series API: construction, attributes,
# array-like and dict-like access, vectorised arithmetic and index alignment.
import pandas as pd

data = [12, 15, 18, 13, 18]
index = ['s1', 's2', 's3', 's4', 's5']
# construct a Series object whose values are labelled by `index`
ser = pd.Series(data, index=index)
# Series has a name attribute
ser.name = 'Test'
print(ser.name)
# Series.values and Series.index
print(ser.values)
print(ser.index, '\n')
# ndarray-like behavior
# Positional access goes through .iloc: with a string-labelled index an
# integer key in ser[...] is a label lookup in current pandas (the old
# positional fallback is deprecated).
print(ser.iloc[0])
print(ser.mean(), ser.std())
# boolean mask: keep the values more than one standard deviation below the mean
print(ser[ser < ser.mean() - ser.std()], '\n')
# dict-like behavior
print(ser['s1'])
print(ser['s1'] + ser['s2'], '\n')
# items() yields (label, value) tuples
for x in ser.items():
    print(x)
# vectorization
print(ser * 2)
# automatic alignment
ser2 = ser[::-1]  # same data in reversed order
print(ser2)
# the addition matches elements by label, not by position
print(ser + ser2)
# --- Plotting ------------
# Histogram of the Series values.
import matplotlib.pyplot as plt

# Enable inline figures when running under IPython/Jupyter.  The bare
# `%matplotlib inline` magic is not valid Python syntax, so the equivalent
# call is used and skipped when no IPython shell is available.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass
#n, bins, patches = plt.hist(ser.values, color='cyan', alpha=0.5)
p = ser.plot(kind='hist', color='blue', alpha=0.5)
# render the figure explicitly, as the other plotting sections do
plt.show()
# --- DataFrame --------------
# Walk-through of DataFrame construction and of column, row and single-item
# access.
data = {'Μαθηματικά': [12, 15, 18, 13, 19],
        'Γλώσσα': [11, 14, 17, 12, 18],
        'Φυσική': [13, 15, 19, 12, 20]}
index = ['s1', 's2', 's3', 's4', 's5']
df = pd.DataFrame(data, index=index)
print(df)
# access by column
print(df.Γλώσσα)  # Unicode-compatible, but avoid Greek characters in attribute access
print(df['Φυσική'])  # preferred way to access a column
# First column by position: the column labels are strings, so df[[0]] would
# look for a column labelled 0 and fail; .iloc selects positionally.
print(df.iloc[:, [0]])
# access by row - use .loc & .iloc indexers
print(df.loc['s1'])
print(df.loc[['s2', 's3']])
print(df.iloc[[0, 1]])
# access by item - use .at & .iat indexers
print(df.at['s1', 'Φυσική'])
print(df.iat[0, 2])
# overwrite a single cell by position (row 0, column 2)
df.iat[0, 2] = 10
print(df.loc[['s1']])
# --- Plotting ------------
# Plot setup shared by the DataFrame plotting sections.
import matplotlib.pyplot as plt

# Enable inline figures when running under IPython/Jupyter.  The bare
# `%matplotlib inline` magic is not valid Python syntax, so the equivalent
# call is used and skipped when no IPython shell is available.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass
# NOTE(review): Arial is presumably chosen so the Greek titles render - confirm
plt.rc('font', family='Arial')
# --- Plot Column data -----------
# Draw one column of the DataFrame twice: first as a bar chart with one bar
# per row label, then as a histogram of its values.
sf = df['Φυσική']
p1 = sf.plot.bar(title='Φυσική', align='center', yticks=[10, 20])
plt.show()
p2 = sf.plot.hist(title='Φυσική', align='left', yticks=[1, 2])
plt.show()
# --- Plot Row data -----------
# Bar chart of a single row (one bar per column of that row).
row_label = 's4'
ser = df.xs(row_label)
# Build the title from the plotted row so that the two cannot disagree; the
# previous hard-coded title named 'S1' while row 's4' was being plotted.
p = ser.plot(kind='bar', title=row_label.upper() + ' ΒΑΘΜΟΙ',
             align='center', yticks=[10, 20])
plt.show()
# --- Plot Column vs. Columns data -----------
# Scatter plot of one column against another, one point per row.
p = df.plot.scatter(x='Φυσική', y='Γλώσσα')
plt.show()
# --- Pearson Correlation ----------------------
# Correlate two columns of the DataFrame and print what pearsonr returns.
from scipy import stats as st

ph, gl = df['Φυσική'], df['Γλώσσα']
correlation = st.pearsonr(ph, gl)
print(correlation)
# -- Groupby -----------------
# Split the "groupby" sheet of the workbook into groups and summarise them.
import pandas as pd

# `sheet_name` is the current spelling of the former `sheetname` keyword, and
# `convert_float` is no longer accepted by read_excel, so it is dropped.
# NOTE(review): without convert_float=False whole-number cells may load as
# ints instead of floats - confirm nothing downstream depends on the dtype.
data = pd.read_excel("etpe2017.xlsx", sheet_name="groupby",
                     index_col=None)
#print(data)
# --- Groupby------
# --- 2 groups -----
# one group per distinct value of the 'Gender' column
my2groups = data.groupby('Gender')
print(my2groups.describe())
boys = my2groups.get_group('b')
print(boys)
girls = my2groups.get_group('g')
print(girls)
# --- 4 groups -----
print('-------------------------')
# one group per (Gender, Level) combination; groups are addressed by tuple
my4groups = data.groupby(['Gender', 'Level'])
print(my4groups.describe())
boysK6 = my4groups.get_group(('b', 'K6'))
print(boysK6.Performance.mean())
# ----- t-test using pandas ---------------
# Independent-samples t-test between the Control and Treatment columns, with
# normality and variance checks and an effect size.
import pandas as pd
import scipy.stats as stats

# --- Load the data from an xlsx file ----------------------
# `sheet_name` is the current spelling of the former `sheetname` keyword, and
# `convert_float` is no longer accepted by read_excel, so it is dropped.
data = pd.read_excel("etpe2017.xlsx", sheet_name="ttest-indep",
                     index_col=None)
# --- Load the data from a csv file ----------------------
# df = pd.read_csv("etpe2017.csv",sep=',',
#                  skiprows=3, header=0, index_col=0)
# print(df)
# drop missing cells so each group only holds actual observations
Control = data.Control.dropna()
print(Control.describe())
mnControl = Control.mean()
stdControl = Control.std()
print()
Treatment = data.Treatment.dropna()
print(Treatment.describe())
mnTreatment = Treatment.mean()
# element [1] of each test result is printed (the p-value)
print('Normality Control:', round(stats.shapiro(Control)[1], 4))
print('Normality Treatment:', round(stats.shapiro(Treatment)[1], 4))
print('Between Group Variance:', stats.levene(Control, Treatment)[1])
t, p = stats.ttest_ind(Control, Treatment)
print('t = ', round(t, 4), 'p = ', round(p, 4))
if p <= 0.05:
    print('Sig.')
# NOTE(review): the original indentation was lost; the effect size is reported
# unconditionally here - confirm it was not meant to be printed only when the
# test is significant.
# small <= 0.2 Medium <=0.8 LARGE
# mean difference scaled by the Control group's standard deviation
es = abs(mnControl - mnTreatment) / stdControl
print('Effect size = ', round(es, 4))
# --- ANOVA ----------------
# One-way ANOVA across the Control, Treatment1 and Treatment2 columns,
# preceded by normality and variance checks.
import pandas as pd
import scipy.stats as stats

# `sheet_name` is the current spelling of the former `sheetname` keyword, and
# `convert_float` is no longer accepted by read_excel, so it is dropped.
data = pd.read_excel("etpe2017.xlsx", sheet_name="anova",
                     index_col=None)
# drop missing cells so each group only holds actual observations
dC = data.Control.dropna()
dT1 = data.Treatment1.dropna()
dT2 = data.Treatment2.dropna()
print('Descriptive Stats')
# print('Control\n',dC.describe())
# print('Treatment1\n',dT1.describe())
# print('Treatment2\n',dT2.describe())
print('Normality test')
print('Control', stats.shapiro(dC))
print('Treatment1', stats.shapiro(dT1))
print('Treatment2', stats.shapiro(dT2))
print('Variance test')
print(stats.levene(dC, dT1, dT2))
F, p = stats.f_oneway(dC, dT1, dT2)
print('F statistic = {:5.3f} and probability p = {:5.3f}'.format(F, p))
# --- Reshape Data --------------------------
import numpy as np
# --- Tukey's HSD test --------------------------
import statsmodels.stats.multicomp as ml

# Option 1: read ready-made long-format data from a file
# data_tukey = pd.read_excel('etpe2017.xlsx', sheetname="tukey")
# Option 2: build the long-format data in code
cval, t1val, t2val = dC.values, dT1.values, dT2.values
# all scores stacked into one array: control, then treatment 1, then treatment 2
arScore = np.concatenate((cval, t1val, t2val))
# one group tag per score, in the same order as the scores above
csymbol = ['c'] * len(cval)
t1symbol = ['t1'] * len(t1val)
t2symbol = ['t2'] * len(t2val)
arGroup = np.concatenate((csymbol, t1symbol, t2symbol))
di = dict(Score=arScore, Group=arGroup)
data_tukey = pd.DataFrame(di)
# preview of the resulting layout (first rows only)
print(data_tukey.head(), '\n')
# build a MultiComparison object and call its tukeyhsd method
tukey_object = ml.MultiComparison(data_tukey.Score, data_tukey.Group)
out_tukey = tukey_object.tukeyhsd(0.05)
print(out_tukey)