In [6]:
# --- Series -----------------
# Tour of pandas.Series: construction, attributes, positional and label
# access, vectorised arithmetic and automatic index alignment.

import pandas as pd 

data = [12,15,18,13,18]
index = ['s1','s2','s3','s4','s5']

# construct a Series object (labels passed by keyword so the call reads
# unambiguously as values + index)
ser = pd.Series(data, index=index)

# Series has a name attribute
ser.name = 'Test'
print(ser.name)

# Series.values and Series.index
print(ser.values)
print(ser.index,'\n')

# ndarray-like behavior
# Positional access is spelled with .iloc: a bare integer key such as ser[0]
# on a string-labelled index relies on a deprecated positional fallback.
print(ser.iloc[0])
print(ser.mean(), ser.std())
# boolean mask: values more than one standard deviation below the mean
print(ser[ser < ser.mean()-ser.std()],'\n')

# dict-like behavior
print(ser['s1'])
print(ser['s1']+ser['s2'],'\n')
for x in ser.items():
    print(x)   # each item is a (label, value) tuple

# vectorization 
print(ser * 2)

# automatic alignment 
ser2 = ser[::-1]  # same data in reversed order
print(ser2)
# addition matches elements by label, not by position, so this equals ser * 2
print(ser + ser2)

# --- Plotting ------------
import matplotlib.pyplot as plt
%matplotlib inline

#n, bins, patches = plt.hist(ser.values, color='cyan', alpha=0.5)
p = ser.plot(kind='hist', color='blue', alpha=0.5)
Test
[12 15 18 13 18]
Index(['s1', 's2', 's3', 's4', 's5'], dtype='object') 

12
15.2 2.7748873851023212
s1    12
Name: Test, dtype: int64 

12
27 

('s1', 12)
('s2', 15)
('s3', 18)
('s4', 13)
('s5', 18)
s1    24
s2    30
s3    36
s4    26
s5    36
Name: Test, dtype: int64
s5    18
s4    13
s3    18
s2    15
s1    12
Name: Test, dtype: int64
s1    24
s2    30
s3    36
s4    26
s5    36
Name: Test, dtype: int64
In [19]:
# --- DataFrame --------------
# Tour of pandas.DataFrame: construction from a dict of columns and the
# column / row / single-item access paths.

data = {'Μαθηματικά':[12,15,18,13,19],
       'Γλώσσα':[11,14,17,12,18],
       'Φυσική':[13,15,19,12,20]}
index = ['s1','s2','s3','s4','s5']

df = pd.DataFrame(data, index=index)
print(df)

# access by column
print(df.Γλώσσα)      # attribute access works with Unicode names, but avoid Greek identifiers
print(df['Φυσική'])   # preferred: access columns by label with []
# First column by position. df[[0]] would look for a column *labelled* 0,
# which this frame does not have, so positional selection goes through .iloc.
print(df.iloc[:, [0]])

# access by row - use .loc & .iloc indexers
print(df.loc['s1'])            # one row by label -> Series
print(df.loc[['s2','s3']])     # several rows by label -> DataFrame
print(df.iloc[[0,1]])          # several rows by position -> DataFrame

# access by item - use .at & .iat indexers
print(df.at['s1','Φυσική'])    # scalar by (row label, column label)
print(df.iat[0,2])             # scalar by (row position, column position)
df.iat[0,2] = 10               # in-place assignment: later code sees the new value
print(df.loc[['s1']])

# --- Plotting ------------

import matplotlib.pyplot as plt
%matplotlib inline
plt.rc('font', family='Arial')

# --- Plot Column data -----------
sf = df.xs('Φυσική', axis=1)
p1 = sf.plot(kind='bar', title='Φυσική', align = 'center', yticks=[10, 20])
plt.show()
p2 = sf.plot(kind='hist', title='Φυσική', align = 'left', yticks=[1, 2])
plt.show()

# --- Plot Row data -----------
ser = df.xs('s4')
p = ser.plot(kind='bar', title='S1 ΒΑΘΜΟΙ', align = 'center', yticks=[10, 20])
plt.show()

# --- Plot Column vs. Columns data -----------
p = df.plot('Φυσική', 'Γλώσσα', kind='scatter')
plt.show()

# --- Pearson Correlation ----------------------
# Correlation between the 'Φυσική' and 'Γλώσσα' columns; the printed result
# carries the correlation coefficient followed by its p-value.
from scipy import stats as st
ph = df.loc[:, 'Φυσική']
gl = df.loc[:, 'Γλώσσα']
print(st.pearsonr(ph, gl))
    Γλώσσα  Μαθηματικά  Φυσική
s1      11          12      13
s2      14          15      15
s3      17          18      19
s4      12          13      12
s5      18          19      20
s1    11
s2    14
s3    17
s4    12
s5    18
Name: Γλώσσα, dtype: int64
s1    13
s2    15
s3    19
s4    12
s5    20
Name: Φυσική, dtype: int64
    Γλώσσα
s1      11
s2      14
s3      17
s4      12
s5      18
Γλώσσα        11
Μαθηματικά    12
Φυσική        13
Name: s1, dtype: int64
    Γλώσσα  Μαθηματικά  Φυσική
s2      14          15      15
s3      17          18      19
    Γλώσσα  Μαθηματικά  Φυσική
s1      11          12      13
s2      14          15      15
13
13
    Γλώσσα  Μαθηματικά  Φυσική
s1      11          12      10
(0.99715684605335475, 0.00018190687773923228)
In [23]:
# -- Groupby -----------------
# Split the "groupby" sheet by Gender, then by (Gender, Level), and
# summarise the groups.

import pandas as pd

# `sheet_name` is the current keyword (the old `sheetname` spelling is no
# longer accepted by pandas). `convert_float` is gone as well; the explicit
# float dtype keeps Performance as floating point, as convert_float=False did.
data = pd.read_excel("etpe2017.xlsx", sheet_name="groupby",
                     index_col=None, dtype={'Performance': float})
#print(data)

# --- 2 groups -----

my2groups = data.groupby('Gender')
print(my2groups.describe())

# get_group returns the rows belonging to a single group key
boys = my2groups.get_group('b')
print(boys)
girls = my2groups.get_group('g')
print(girls)

# --- 4 groups -----
print('-------------------------')
my4groups = data.groupby(['Gender','Level'])
print(my4groups.describe())

# with several grouping keys a group is addressed by a tuple of key values
boysK6 = my4groups.get_group(('b','K6'))
print(boysK6.Performance.mean())
              Performance
Gender                   
b      count    21.000000
       mean     59.761905
       std       7.154752
       min      45.000000
       25%      55.000000
       50%      60.000000
       75%      65.000000
       max      70.000000
g      count    19.000000
       mean     75.789474
       std       7.685332
       min      60.000000
       25%      70.000000
       50%      75.000000
       75%      80.000000
       max      90.000000
   Level  Performance
0     K6         60.0
1     K6         65.0
5     K6         65.0
6     K6         60.0
7     K6         65.0
8     K6         70.0
13    K6         45.0
14    K6         50.0
15    K6         55.0
16    K6         60.0
17    K6         65.0
21    K9         65.0
25    K9         65.0
26    K9         60.0
27    K9         65.0
33    K9         45.0
34    K9         50.0
35    K9         55.0
36    K9         60.0
37    K9         65.0
39    K9         65.0
   Level  Performance
2     K6         70.0
3     K6         75.0
4     K6         70.0
9     K6         75.0
10    K6         80.0
11    K6         85.0
12    K6         90.0
18    K9         70.0
19    K9         75.0
20    K9         60.0
22    K9         70.0
23    K9         75.0
24    K9         70.0
28    K9         70.0
29    K9         75.0
30    K9         80.0
31    K9         85.0
32    K9         90.0
38    K9         75.0
-------------------------
                    Performance
Gender Level                   
b      K6    count    11.000000
             mean     60.000000
             std       7.416198
             min      45.000000
             25%      57.500000
             50%      60.000000
             75%      65.000000
             max      70.000000
       K9    count    10.000000
             mean     59.500000
             std       7.245688
             min      45.000000
             25%      56.250000
             50%      62.500000
             75%      65.000000
             max      65.000000
g      K6    count     7.000000
             mean     77.857143
             std       7.559289
             min      70.000000
             25%      72.500000
             50%      75.000000
             75%      82.500000
             max      90.000000
       K9    count    12.000000
             mean     74.583333
             std       7.821396
             min      60.000000
             25%      70.000000
             50%      75.000000
             75%      76.250000
             max      90.000000
60.0
In [24]:
# ----- t-test using pandas ---------------
# Independent-samples t-test between the Control and Treatment columns,
# preceded by normality (Shapiro-Wilk) and variance (Levene) checks and
# followed by an effect-size estimate.
import pandas as pd
import scipy.stats as stats

# --- Load the data from an xlsx file ----------------------
# `sheet_name` is the current keyword (the old `sheetname` spelling is no
# longer accepted by pandas). `convert_float` is gone as well; the explicit
# float dtype keeps both score columns as floating point.
data = pd.read_excel("etpe2017.xlsx", sheet_name="ttest-indep",
                     index_col=None,
                     dtype={'Control': float, 'Treatment': float})

# --- Alternative: load the data from a csv file ----------------------

# df = pd.read_csv("etpe2017.csv",sep=',',
#                  skiprows=3, header=0, index_col=0)
# print(df)

# the groups may differ in size, so drop each column's missing cells separately
Control = data.Control.dropna()
print(Control.describe())
mnControl = Control.mean()
stdControl = Control.std()

print()
Treatment = data.Treatment.dropna() 
print(Treatment.describe())
mnTreatment = Treatment.mean()

# index [1] of each test result is its p-value
print('Normality Control:', round(stats.shapiro(Control)[1],4))
print('Normality Treatment:',round(stats.shapiro(Treatment)[1],4))
print('Between Group Variance:',stats.levene(Control,Treatment)[1])
t, p = stats.ttest_ind(Control, Treatment)

print('t = ',round(t,4),'p = ',round(p,4))
if p<=0.05:
    print('Sig.')

# Effect size: absolute mean difference scaled by the control group's
# standard deviation.
# Rule of thumb used here: <= 0.2 small, <= 0.8 medium, above that large.
es = abs(mnControl-mnTreatment)/stdControl
print('Effect size = ',round(es,4))
count    38.000000
mean     67.236842
std      11.131728
min      45.000000
25%      60.000000
50%      65.000000
75%      75.000000
max      90.000000
Name: Control, dtype: float64

count     40.000000
mean      76.625000
std       11.231109
min       55.000000
25%       70.000000
50%       75.000000
75%       85.000000
max      100.000000
Name: Treatment, dtype: float64
Normality Control: 0.3765
Normality Treatment: 0.3397
Between Group Variance: 0.965185963804
t =  -3.706 p =  0.0004
Sig.
Effect size =  0.8434
In [25]:
# --- ANOVA ----------------
# One-way ANOVA across the Control, Treatment1 and Treatment2 columns,
# preceded by normality (Shapiro-Wilk) and variance (Levene) checks.

import pandas as pd
import scipy.stats as stats

# `sheet_name` is the current keyword (the old `sheetname` spelling is no
# longer accepted by pandas). `convert_float` is gone as well; the explicit
# float dtype keeps the three score columns as floating point.
data = pd.read_excel("etpe2017.xlsx", sheet_name="anova",
                     index_col=None,
                     dtype={'Control': float,
                            'Treatment1': float,
                            'Treatment2': float})

# drop each column's missing cells separately
dC = data.Control.dropna()
dT1 = data.Treatment1.dropna()
dT2 = data.Treatment2.dropna()

print('Descriptive Stats')
# print('Control\n',dC.describe())
# print('Treatment1\n',dT1.describe())
# print('Treatment2\n',dT2.describe())

print('Normality test')
print('Control',stats.shapiro(dC))
print('Treatment1', stats.shapiro(dT1))
print('Treatment2',stats.shapiro(dT2))

print('Variance test')
print(stats.levene(dC, dT1, dT2))

# F statistic and p-value of the one-way ANOVA over the three groups
F, p = stats.f_oneway(dC, dT1, dT2)
print('F statistic = {:5.3f} and probability p = {:5.3f}'.format(F, p)) 


# --- Reshape Data --------------------------
# MultiComparison is given one column of scores and a parallel column of
# group labels, so the three per-group Series are stacked into that layout.
import numpy as np 

# --- Tukey's t-test --------------------------
import statsmodels.stats.multicomp as ml

# Option 1: read an already stacked sheet from the workbook with
# pd.read_excel (a "tukey" sheet of etpe2017.xlsx).

# Option 2: build the stacked data in code
cval = dC.values
t1val = dT1.values
t2val = dT2.values
arScore = np.concatenate([cval, t1val, t2val])

# one label per score, in the same order as the scores above
csymbol = ['c'] * len(cval)
t1symbol = ['t1'] * len(t1val)
t2symbol = ['t2'] * len(t2val)
arGroup = np.concatenate([csymbol, t1symbol, t2symbol])

di = dict(Score=arScore, Group=arGroup)

data_tukey = pd.DataFrame(di)

# what the stacked data looks like (first rows only)
print(data_tukey.head(),'\n')

# build the MultiComparison object and run its tukeyhsd method
tukey_object = ml.MultiComparison(data_tukey.Score, data_tukey.Group)
out_tukey = tukey_object.tukeyhsd(alpha=0.05)

print(out_tukey)
Descriptive Stats
Normality test
Control (0.9695044159889221, 0.3470972180366516)
Treatment1 (0.9589287638664246, 0.13518860936164856)
Treatment2 (0.9781549572944641, 0.5761716961860657)
Variance test
LeveneResult(statistic=0.054610924231632968, pvalue=0.9468766068380543)
F statistic = 4.668 and probability p = 0.011
  Group  Score
0     c   62.0
1     c   67.0
2     c   72.0
3     c   77.0
4     c   72.0 

Multiple Comparison of Means - Tukey HSD,FWER=0.05
==============================================
group1 group2 meandiff  lower    upper  reject
----------------------------------------------
  c      t1    5.9286   0.197   11.6601  True 
  c      t2   -0.7093  -6.4082   4.9896 False 
  t1     t2   -6.6379  -12.2661 -1.0097  True 
----------------------------------------------