Prev: - | Next: t-Test (Independent samples)
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
%matplotlib inline
# sep=' ' tells pandas that the columns in the file are separated by single spaces
data = pd.read_csv("../../data/researchdata.csv", sep=' ')
# Show the first rows (head() prints 5 lines unless a numerical argument is given)
print(data.head())
print('\n..............\n')
# Show the last rows (tail() prints 5 lines unless a numerical argument is given)
print(data.tail())
# The 'Control' group is the case study for the descriptive statistics
dc = data['Control']
1) Distributed across a range of values
2) Exhibit a tendency to center around specific values (mean, median, mode), and
3) Exhibit variation and disperse around these specific values
- index: the unique values found in the Series
- values: the frequency (count) of each unique value
# Frequency distribution of the 'Control' data: the resulting Series is indexed
# by the unique values, and each entry is how often that value occurs.
frd = dc.value_counts()
frd
# Bar positions: each unique value moved 1.5 points to the left.
# NOTE(review): Matplotlib's bar() centers bars on x by default, so with
# width=3.0 this offset puts each bar's right edge on the value -- confirm
# that this is the intended placement.
x = frd.index.values - 1.5
# Keep the bar container under its own name; binding it to 'pd' would shadow
# the pandas alias and break every later 'pd.' call in the session.
bars = plt.bar(x, frd, width=3.0, color='cyan')
dfcontr = frd.to_frame()  # a) convert 'frd' Series to DataFrame with Series.to_frame()
dfcontr.index.name = 'Values'  # Set 'dfcontr' DataFrame index name
dfcontr.columns = ['Frequency']  # Set 'dfcontr' column name
from IPython.display import HTML  # HTML display helper for notebooks
HTML(dfcontr.to_html())  # Render the DataFrame in tabular (HTML) form
- (a) Appropriate numpy/scipy specific statistical functions and/or
- (b) the describe() function
# (a) Individual statistics of the 'Control' series, evaluated together as one tuple:
# mean, median, standard deviation, variance, minimum, maximum, skewness, kurtosis
dc.mean(), dc.median(), dc.std(), dc.var(), dc.min(), dc.max(), dc.skew(), dc.kurt()
# (b) Use describe() to get a summary of the series from a single call
dc.describe()
# Cumulative (running) sum of the series; tail() keeps only its last rows
dc.cumsum().tail()
# count() of the 'Control' series
dc.count()
# Correlation between the columns of the full DataFrame, using the Spearman method
data.corr(method='spearman')
# Covariance between the columns of the full DataFrame
data.cov()
However, the standard deviation is the measure typically used in statistics to calculate and express the degree of dispersion of a set of data and it is defined as the square root of the variance:
standard deviation = sqrt(variance)
The important thing you need to know about standard deviation is that the population standard deviation ('σ') is calculated slightly differently from the sample standard deviation ('s'), as you can see in the figure below.
import numpy as np
import statistics as st
import pandas as pd
def mystd(dt, kind='sample'):  # kind: {sample, population, unbiased}
    """Return the standard deviation of the data in *dt*.

    The sum of squared deviations from the mean is divided by a divisor that
    depends on *kind*, and the square root of that variance is returned.

    Parameters
    ----------
    dt : array-like
        Numeric data (list, tuple, NumPy array, pandas Series, ...). It is
        converted to a float NumPy array and all of its elements are used.
    kind : {'sample', 'population', 'unbiased'}, default 'sample'
        Selects the divisor of the sum of squares:
        'sample' -> N - 1, 'population' -> N, 'unbiased' -> N - 1.5.

    Returns
    -------
    numpy.float64
        The standard deviation, or NaN when the divisor is not positive
        (empty input, or too few data points for the requested kind).

    Raises
    ------
    ValueError
        If *kind* is not one of the supported names.
    """
    # Amount subtracted from N to obtain the variance divisor for each kind.
    divisor_offsets = {'sample': 1, 'population': 0, 'unbiased': 1.5}
    try:
        offset = divisor_offsets[kind]
    except (KeyError, TypeError):
        # Fail loudly instead of handing an error string back to numeric code.
        raise ValueError('kind should be {population, sample, unbiased}') from None

    # Coerce first so that plain Python sequences work as well as arrays.
    values = np.asarray(dt, dtype=float)

    # N: data size
    n = values.size
    divisor = n - offset
    if divisor <= 0:
        # Not enough data for this estimator; avoid dividing by zero or by a
        # negative number, which would produce a misleading result.
        return np.float64(np.nan)

    # SS: sum of squared deviations from the mean M
    deviations = values - values.mean()
    sum_of_squares = np.sum(deviations ** 2)

    return np.sqrt(sum_of_squares / divisor)
# Demo data: the integers 1 through 10
ar = np.array(range(1, 11))
# Standard deviation of the same data computed three different ways
print(ar.std())  # NumPy array method
print(st.stdev(ar))  # statistics module
print(mystd(ar, kind='sample'))  # hand-written mystd() with the 'sample' divisor
. Free learning material
. See full copyright and disclaimer notice