%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
Chapter 2.1.2
from rpy2.robjects import r, pandas2ri
pandas2ri.activate()
r('library(PBImisc)')
r.data('heights')
heights = r['heights']
print(heights.head(2))
Husband Wife
1 186 175
2 180 168
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(7,6))
ax = fig.add_subplot(1,1,1)
ax.plot(heights["Wife"],heights["Husband"],'.',markersize=10)
plt.xlabel("Wife")
plt.ylabel("Husband")
fig.tight_layout()
import statsmodels.formula.api as smf
model1 = smf.ols('Husband ~ Wife', data=heights, missing='drop').fit()
print(model1.summary())
OLS Regression Results
==============================================================================
Dep. Variable: Husband R-squared: 0.583
Model: OLS Adj. R-squared: 0.578
Method: Least Squares F-statistic: 131.3
Date: pon, 01 sty 2018 Prob (F-statistic): 1.54e-19
Time: 11:37:55 Log-Likelihood: -314.43
No. Observations: 96 AIC: 632.9
Df Residuals: 94 BIC: 638.0
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept 37.8100 11.932 3.169 0.002 14.118 61.502
Wife 0.8329 0.073 11.458 0.000 0.689 0.977
==============================================================================
Omnibus: 0.324 Durbin-Watson: 1.986
Prob(Omnibus): 0.850 Jarque-Bera (JB): 0.303
Skew: 0.130 Prob(JB): 0.860
Kurtosis: 2.912 Cond. No. 2.97e+03
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 2.97e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
import statsmodels.formula.api as smf
model2 = smf.ols('Wife ~ Husband', data=heights, missing='drop').fit()
print(model2.summary())
OLS Regression Results
==============================================================================
Dep. Variable: Wife R-squared: 0.583
Model: OLS Adj. R-squared: 0.578
Method: Least Squares F-statistic: 131.3
Date: pon, 01 sty 2018 Prob (F-statistic): 1.54e-19
Time: 11:37:55 Log-Likelihood: -306.06
No. Observations: 96 AIC: 616.1
Df Residuals: 94 BIC: 621.2
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept 41.9302 10.662 3.933 0.000 20.761 63.099
Husband 0.6997 0.061 11.458 0.000 0.578 0.821
==============================================================================
Omnibus: 5.735 Durbin-Watson: 1.824
Prob(Omnibus): 0.057 Jarque-Bera (JB): 5.268
Skew: -0.564 Prob(JB): 0.0718
Kurtosis: 3.214 Cond. No. 3.08e+03
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 3.08e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
import statsmodels.formula.api as smf
model3 = smf.ols('Husband ~ Wife-1', data=heights, missing='drop').fit()
print(model3.summary())
OLS Regression Results
==============================================================================
Dep. Variable: Husband R-squared: 0.999
Model: OLS Adj. R-squared: 0.998
Method: Least Squares F-statistic: 6.378e+04
Date: pon, 01 sty 2018 Prob (F-statistic): 3.98e-136
Time: 11:37:55 Log-Likelihood: -319.30
No. Observations: 96 AIC: 640.6
Df Residuals: 95 BIC: 643.2
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Wife 1.0629 0.004 252.556 0.000 1.055 1.071
==============================================================================
Omnibus: 3.309 Durbin-Watson: 1.886
Prob(Omnibus): 0.191 Jarque-Bera (JB): 2.665
Skew: 0.379 Prob(JB): 0.264
Kurtosis: 3.305 Cond. No. 1.00
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
import statsmodels.stats.api as sms
anovaResults = sms.anova_lm(model3, model1)
print(anovaResults)
df_resid ssr df_diff ss_diff F Pr(>F)
0 95.0 4352.548638 0.0 NaN NaN NaN
1 94.0 3932.494048 1.0 420.054591 10.040735 0.002066
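The same nested-model comparison is available directly from the unrestricted results object; a minimal sketch (model3 is the restricted, no-intercept model):
# F-test of the restricted model3 against model1; this should reproduce
# the F = 10.04, p = 0.0021 line of the ANOVA table above
f_stat, p_value, df_diff = model1.compare_f_test(model3)
print(f_stat, p_value, df_diff)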
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.sandbox.regression.predstd import wls_prediction_std
se, ci_l, ci_u = wls_prediction_std(model1, exog=[1,170], alpha=0.05)
print(pd.DataFrame({'lower':ci_l,'upper':ci_u,'error':se,'fit':model1.predict(exog=dict(Wife=[170]))[0]}))
error fit lower upper
0 6.516726 179.407227 166.468115 192.34634
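Newer statsmodels releases expose the same prediction interval without the sandbox helper; a sketch, assuming statsmodels >= 0.8:
# prediction interval for a new observation with Wife = 170;
# obs_ci_lower/obs_ci_upper correspond to the lower/upper columns above
pred = model1.get_prediction(dict(Wife=[170]))
print(pred.summary_frame(alpha=0.05)[['mean', 'obs_ci_lower', 'obs_ci_upper']])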
Chapter 2.1.3
from rpy2.robjects import r, pandas2ri
pandas2ri.activate()
r('library(PBImisc)')
r.data('genomes')
genomes = r['genomes']
print(genomes.head(2))
organism group size GC habitat \
1 Acaryochloris marina MBIC11017 Cyanobacteria 8.36 47.0 Aquatic
2 Acholeplasma laidlawii PG-8A Firmicutes 1.50 31.9 Specialized
temp.group temperature
1 Mesophilic NaN
2 Mesophilic 37.0
import seaborn as sns
import numpy as np
fig = plt.figure(figsize=(10,4))
ax1 = fig.add_subplot(121)
ax2 = fig.add_subplot(122)
sns.regplot(x='size', y='GC', data=genomes, fit_reg=True, ci=None, line_kws={'color': 'C1'}, scatter_kws={'alpha': 0.3}, ax=ax1)
sns.regplot(x='size', y='GC', data=genomes, fit_reg=False, ci=None, scatter_kws={'color': 'C0', 'alpha': 0.3}, ax=ax2)
ax1.set_title('(a) linear scale')
ax2.set_title('(b) logarithmic scale')
l_size = np.log(genomes['size'])
l_GC = np.log(genomes['GC'])
m, c = np.polyfit(l_size, l_GC, 1)
y_fit = np.exp(m*l_size + c)
ax2.set_xscale('log')
ax2.set_yscale('log')
ax2.plot(genomes['size'], y_fit,'C1')
plt.tight_layout()
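As a consistency check, the polyfit coefficients on the logged data should agree with an OLS fit of the same log-log model (fitted below as model_gen_2); a sketch:
# slope and intercept from np.polyfit vs. OLS on the logged variables;
# both should give roughly slope 0.312 and intercept 3.480
check = smf.ols('np.log(GC) ~ np.log(size)', data=genomes).fit()
print(m, c)
print(check.params.values)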
model_gen_1 = smf.ols('GC ~ size', data=genomes, missing='drop').fit()
print(model_gen_1.summary())
OLS Regression Results
==============================================================================
Dep. Variable: GC R-squared: 0.375
Model: OLS Adj. R-squared: 0.375
Method: Least Squares F-statistic: 434.1
Date: pon, 01 sty 2018 Prob (F-statistic): 7.51e-76
Time: 11:37:56 Log-Likelihood: -2720.2
No. Observations: 724 AIC: 5444.
Df Residuals: 722 BIC: 5453.
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept 32.6825 0.824 39.687 0.000 31.066 34.299
size 4.2529 0.204 20.835 0.000 3.852 4.654
==============================================================================
Omnibus: 16.973 Durbin-Watson: 0.903
Prob(Omnibus): 0.000 Jarque-Bera (JB): 11.320
Skew: 0.173 Prob(JB): 0.00348
Kurtosis: 2.495 Cond. No. 9.03
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
print('LogLik:',model_gen_1.llf)
LogLik: -2720.15765463
print(model_gen_1.cov_params(scale=1))
Intercept size
Intercept 0.006298 -0.001379
size -0.001379 0.000387
model_gen_1.params
Intercept 32.682470
size 4.252881
dtype: float64
print('R2:', model_gen_1.rsquared)
R2: 0.375482150502
model_gen_2 = smf.ols('np.log(GC) ~ np.log(size)', data=genomes, missing='drop').fit()
print(model_gen_2.summary())
OLS Regression Results
==============================================================================
Dep. Variable: np.log(GC) R-squared: 0.411
Model: OLS Adj. R-squared: 0.410
Method: Least Squares F-statistic: 502.8
Date: pon, 01 sty 2018 Prob (F-statistic): 6.33e-85
Time: 11:37:56 Log-Likelihood: 75.101
No. Observations: 724 AIC: -146.2
Df Residuals: 722 BIC: -137.0
Df Model: 1
Covariance Type: nonrobust
================================================================================
coef std err t P>|t| [0.025 0.975]
--------------------------------------------------------------------------------
Intercept 3.4801 0.018 198.438 0.000 3.446 3.515
np.log(size) 0.3121 0.014 22.424 0.000 0.285 0.339
==============================================================================
Omnibus: 10.804 Durbin-Watson: 0.813
Prob(Omnibus): 0.005 Jarque-Bera (JB): 9.907
Skew: -0.235 Prob(JB): 0.00706
Kurtosis: 2.674 Cond. No. 4.20
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
print('logLik:',model_gen_2.llf)
logLik: 75.1013077561
model_gen_3 = smf.ols('GC ~ temperature', data=genomes, missing='drop').fit()
print(model_gen_3.summary())
OLS Regression Results
==============================================================================
Dep. Variable: GC R-squared: 0.009
Model: OLS Adj. R-squared: 0.006
Method: Least Squares F-statistic: 3.597
Date: pon, 01 sty 2018 Prob (F-statistic): 0.0586
Time: 11:37:56 Log-Likelihood: -1600.4
No. Observations: 402 AIC: 3205.
Df Residuals: 400 BIC: 3213.
Df Model: 1
Covariance Type: nonrobust
===============================================================================
coef std err t P>|t| [0.025 0.975]
-------------------------------------------------------------------------------
Intercept 51.4516 1.667 30.867 0.000 48.175 54.729
temperature -0.0759 0.040 -1.896 0.059 -0.155 0.003
==============================================================================
Omnibus: 201.221 Durbin-Watson: 0.813
Prob(Omnibus): 0.000 Jarque-Bera (JB): 23.772
Skew: 0.104 Prob(JB): 6.89e-06
Kurtosis: 1.827 Cond. No. 107.
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Chapter 2.1.4
from rpy2.robjects import r, pandas2ri
pandas2ri.activate()
r('library(PBImisc)')
r.data('anscombe')
anscombe = r['anscombe']
print(anscombe.dtypes)
x1 float64
x2 float64
x3 float64
x4 float64
y1 float64
y2 float64
y3 float64
y4 float64
dtype: object
m1 = smf.ols('y1 ~ x1', data=anscombe, missing='drop').fit().summary().tables[1]
m2 = smf.ols('y2 ~ x2', data=anscombe, missing='drop').fit().summary().tables[1]
m3 = smf.ols('y3 ~ x3', data=anscombe, missing='drop').fit().summary().tables[1]
m4 = smf.ols('y4 ~ x4', data=anscombe, missing='drop').fit().summary().tables[1]
print(m1)
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept 3.0001 1.125 2.667 0.026 0.456 5.544
x1 0.5001 0.118 4.241 0.002 0.233 0.767
==============================================================================
print(m2)
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept 3.0009 1.125 2.667 0.026 0.455 5.547
x2 0.5000 0.118 4.239 0.002 0.233 0.767
==============================================================================
print(m3)
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept 3.0025 1.124 2.670 0.026 0.459 5.546
x3 0.4997 0.118 4.239 0.002 0.233 0.766
==============================================================================
print(m4)
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept 3.0017 1.124 2.671 0.026 0.459 5.544
x4 0.4999 0.118 4.243 0.002 0.233 0.766
==============================================================================
Chapter 2.1.4.6
# fitted values:
fitted = model_gen_2.fittedvalues
# residuals:
resid = model_gen_2.resid
# normalized residuals:
norm_resid = model_gen_2.get_influence().resid_studentized_internal
# square roots of absolute values of normalized residuals:
norm_resid_abs_sqrt = np.sqrt(np.abs(norm_resid))
# leverage:
leverage = model_gen_2.get_influence().hat_matrix_diag
# Cook's distance:
cooks = model_gen_2.get_influence().cooks_distance[0]
from statsmodels.graphics.regressionplots import *
fig = plt.figure(figsize=(10,22))
ax1 = fig.add_subplot(621)
ax2 = fig.add_subplot(622)
ax3 = fig.add_subplot(623)
ax4 = fig.add_subplot(624)
ax5 = fig.add_subplot(625)
ax6 = fig.add_subplot(626)
sns.residplot(x=fitted, y=resid,
              lowess=True, line_kws={'color': 'C1'}, scatter_kws={'alpha': 0.3}, ax=ax1)
ax1.set_xlabel('fitted'); ax1.set_ylabel('resid')
sm.qqplot(resid, line='q',ax=ax2)
sns.residplot(x=fitted, y=norm_resid_abs_sqrt,
              lowess=True, line_kws={'color': 'C1'}, scatter_kws={'color': 'C0', 'alpha': 0.3}, ax=ax3)
ax3.set_xlabel('fitted'); ax3.set_ylabel('norm_resid_abs_sqrt')
ax4.stem(np.arange(len(cooks)), cooks, markerfmt=",")
ax4.set_xlabel('obs. number'); ax4.set_ylabel('cooks')
sns.residplot(x=leverage, y=norm_resid,
              lowess=True, line_kws={'color': 'C1'}, scatter_kws={'color': 'C0', 'alpha': 0.3}, ax=ax5)
ax5.set_xlabel('leverage'); ax5.set_ylabel('norm resid')
ax5.set_xlim(0,0.024)
sns.regplot(x=leverage, y=cooks,
            fit_reg=False, line_kws={'color': 'C1'}, scatter_kws={'color': 'C0', 'alpha': 0.3}, ax=ax6)
ax6.set_xlabel('leverage'); ax6.set_ylabel('cooks')
ax6.set_xlim(0,0.018)
fig.tight_layout()
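All of the diagnostic quantities computed above are also available in one table via the influence object; a sketch, assuming model_gen_2 from above:
# single influence table: Cook's distance, leverage, studentized residuals
infl = model_gen_2.get_influence().summary_frame()
print(infl[['cooks_d', 'hat_diag', 'student_resid']].head())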
Chapter 2.1.4.7
import statsmodels.stats.api as sms  # alias sms, so statsmodels.formula.api (smf) is not shadowed
LM, LM_p, F, F_p = sms.het_breuschpagan(model_gen_2.resid, model_gen_2.model.exog)
print('Breusch-Pagan, LM: ', LM, ', p-value: ', LM_p)
Breusch-Pagan, LM: 0.179462084257 , p-value: 0.671835924893
sol = sms.het_goldfeldquandt(model_gen_2.resid, model_gen_2.model.exog, idx=1, split=0.5)
print('Goldfeld-Quandt, F: ', sol[0], ', p-value: ', sol[1])
Goldfeld-Quandt, F: 1.07019343064 , p-value: 0.260088445225
LM, LM_p, F, F_p = sms.het_white(model_gen_2.resid, model_gen_2.model.exog)
print('White, LM: ', LM, ', p-value: ', LM_p)
White, LM: 15.5778419132 , p-value: 0.000414299690222
sms.acorr_breusch_godfrey(model_gen_2, nlags=3)
(262.32198902599026,
1.4141890343091007e-56,
137.92362856701914,
1.4533384978416521e-70)
sms.linear_harvey_collier(model_gen_2)
Ttest_1sampResult(statistic=1.4514201304774603, pvalue=0.14709846769513255)
sms.linear_rainbow(model_gen_2)
(1.5842363541237348, 6.8724511079041082e-06)
from statsmodels.stats.outliers_influence import reset_ramsey
reset_ramsey(model_gen_2, degree=3)
<class 'statsmodels.stats.contrast.ContrastResults'>
<F test: F=array([[ 1.3563947]]), p=0.25824518482657094, df_denom=720, df_num=2>
from scipy import stats
stats.shapiro(resid)
(0.9905691742897034, 0.00013696221867576241)
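Normality of the residuals can also be checked with the Jarque-Bera test that the summary tables report; a sketch using statsmodels.stats.api (sms):
# Jarque-Bera returns (statistic, p-value, skewness, kurtosis)
jb, jb_p, skew, kurt = sms.jarque_bera(resid)
print('Jarque-Bera:', jb, ', p-value:', jb_p)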
Chapter 2.1.5
tGC, wsp = stats.boxcox(genomes['GC'])
wsp
0.44762247282537276
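The returned lambda can be used to reproduce the transformed values by hand, which documents what tGC contains; a sketch:
# for lambda != 0, the Box-Cox transform is (x**lambda - 1) / lambda
manual = (genomes['GC']**wsp - 1) / wsp
print(np.allclose(manual, tGC))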
import statsmodels.formula.api as smf
model_gen_4 = smf.ols('tGC ~ np.log(size)', data=genomes, missing='drop').fit()
print(model_gen_4.summary())
OLS Regression Results
==============================================================================
Dep. Variable: tGC R-squared: 0.405
Model: OLS Adj. R-squared: 0.404
Method: Least Squares F-statistic: 491.2
Date: pon, 01 sty 2018 Prob (F-statistic): 1.99e-83
Time: 11:37:58 Log-Likelihood: -1163.9
No. Observations: 724 AIC: 2332.
Df Residuals: 722 BIC: 2341.
Df Model: 1
Covariance Type: nonrobust
================================================================================
coef std err t P>|t| [0.025 0.975]
--------------------------------------------------------------------------------
Intercept 8.3570 0.097 86.077 0.000 8.166 8.548
np.log(size) 1.7079 0.077 22.163 0.000 1.557 1.859
==============================================================================
Omnibus: 10.852 Durbin-Watson: 0.826
Prob(Omnibus): 0.004 Jarque-Bera (JB): 7.816
Skew: -0.135 Prob(JB): 0.0201
Kurtosis: 2.568 Cond. No. 4.20
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Chapter 2.2.2
from rpy2.robjects import r, pandas2ri
pandas2ri.activate()
r('library(PBImisc)')
r.data('AML')
AML = r['AML']
AML = AML.rename(columns=lambda x: x.replace('.', '_'))
print(AML.head(2))
Mutation CD14_control CD14_D3 CD14_1906 CD14_2191
1 CBFbeta 66.36 75.94 80.18 71.87
2 CBFbeta 56.03 73.40 59.04 79.51
AML['change_2191'] = AML['CD14_2191'] - AML['CD14_control']
AML.groupby(['Mutation'])['change_2191'].mean()
Mutation
CBFbeta 7.814000
FLT3 0.827143
None -6.680556
Other 1.753158
Name: change_2191, dtype: float64
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
fig = plt.figure(figsize=(7,6))
ax = fig.add_subplot(1,1,1)
ax = sns.boxplot(x="Mutation", y="change_2191",color='C0',linewidth=0.5,showmeans=True,
order=np.sort(AML['Mutation'].unique()),data=AML)
plt.axhline(y=0, color='C1', linestyle='--', linewidth=1)
fig.tight_layout()
import statsmodels.formula.api as smf
import statsmodels.stats.api as sms
a1 = smf.ols('CD14_2191 ~ Mutation',data=AML).fit()
aov_table = sms.anova_lm(a1, typ=2)
print(aov_table)
sum_sq df F PR(>F)
Mutation 9404.202478 3.0 6.136347 0.00101
Residual 31672.512421 62.0 NaN NaN
from patsy import dmatrices
y, X = dmatrices("change_2191 ~ Mutation", AML,return_type = 'dataframe')
print(X[:6])
Intercept Mutation[T.FLT3] Mutation[T.None] Mutation[T.Other]
1 1.0 0.0 0.0 0.0
2 1.0 0.0 0.0 0.0
3 1.0 0.0 0.0 0.0
4 1.0 0.0 0.0 0.0
5 1.0 0.0 0.0 0.0
6 1.0 0.0 0.0 0.0
print(aov_table['PR(>F)'][0])
0.00100968613128
import scikit_posthocs as sp
print(
sp.posthoc_ttest(AML, val_col='CD14_2191', group_col='Mutation',
pool_sd = True, equal_var = False, p_adjust = 'holm')
)
CBFbeta FLT3 None Other
CBFbeta -1.000000 0.120078 0.006005 0.959835
FLT3 0.120078 -1.000000 0.514455 0.108687
None 0.006005 0.514455 -1.000000 0.003119
Other 0.959835 0.108687 0.003119 -1.000000
stats.levene(*[x[1] for x in AML.groupby(['Mutation'])['CD14_2191']])
LeveneResult(statistic=0.49548465123111163, pvalue=0.6867414670221561)
stats.shapiro(a1.resid)
(0.9803504347801208, 0.37868833541870117)
from patsy import dmatrices
y, X = dmatrices("change_2191 ~ Mutation-1", AML,return_type = 'dataframe')
print(X[:6])
Mutation[CBFbeta] Mutation[FLT3] Mutation[None] Mutation[Other]
14 1.0 0.0 0.0 0.0
9 1.0 0.0 0.0 0.0
8 1.0 0.0 0.0 0.0
10 1.0 0.0 0.0 0.0
6 1.0 0.0 0.0 0.0
7 1.0 0.0 0.0 0.0
a2 = smf.ols('change_2191 ~ Mutation-1',data=AML).fit()
aov_table2 = sm.stats.anova_lm(a2, typ=2)
print(aov_table2)
sum_sq df F PR(>F)
Mutation 1787.191749 4.0 3.42241 0.013615
Residual 8094.140751 62.0 NaN NaN
model = smf.ols('change_2191 ~ Mutation-1', data=AML, missing='drop').fit()
print(model.summary())
OLS Regression Results
==============================================================================
Dep. Variable: change_2191 R-squared: 0.179
Model: OLS Adj. R-squared: 0.139
Method: Least Squares F-statistic: 4.495
Date: pon, 01 sty 2018 Prob (F-statistic): 0.00643
Time: 11:37:59 Log-Likelihood: -252.35
No. Observations: 66 AIC: 512.7
Df Residuals: 62 BIC: 521.5
Df Model: 3
Covariance Type: nonrobust
=====================================================================================
coef std err t P>|t| [0.025 0.975]
-------------------------------------------------------------------------------------
Mutation[CBFbeta] 7.8140 2.950 2.649 0.010 1.917 13.711
Mutation[FLT3] 0.8271 3.054 0.271 0.787 -5.277 6.931
Mutation[None] -6.6806 2.693 -2.481 0.016 -12.064 -1.297
Mutation[Other] 1.7532 2.621 0.669 0.506 -3.487 6.993
==============================================================================
Omnibus: 1.494 Durbin-Watson: 2.164
Prob(Omnibus): 0.474 Jarque-Bera (JB): 1.234
Skew: -0.138 Prob(JB): 0.539
Kurtosis: 2.390 Cond. No. 1.16
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
from statsmodels.sandbox.stats.multicomp import *
_,holm,_,_ = multipletests(np.array(model.pvalues),method='holm')
holm
array([ 0.04094528, 1. , 0.0475308 , 1. ])
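Pairing the adjusted p-values with the coefficient names makes the result easier to read; a sketch:
# Holm-adjusted p-values labelled by the mutation groups they belong to
print(pd.Series(holm, index=model.pvalues.index))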
Chapter 2.2.3
from rpy2.robjects import r, pandas2ri
pandas2ri.activate()
r('library(PBImisc)')
r.data('vaccination')
vaccination = r['vaccination']
print(vaccination.head(2))
response dose
1 88.9 0 ml
2 105.0 0 ml
print(vaccination.describe(include='all'))
response dose
count 100.000000 100
unique NaN 5
top NaN 10 ml
freq NaN 20
mean 97.890000 NaN
std 27.752097 NaN
min 39.500000 NaN
25% 77.300000 NaN
50% 99.250000 NaN
75% 117.700000 NaN
max 154.700000 NaN
vaccination.groupby(['dose'])['response'].mean()
dose
0 ml 108.570
10 ml 119.265
20 ml 84.025
30 ml 92.370
40 ml 85.220
Name: response, dtype: float64
fig = plt.figure(figsize=(7,6))
ax = fig.add_subplot(1,1,1)
ax = sns.boxplot(x="dose", y="response",color='C0',linewidth=0.5,showmeans=True,
order=np.sort(vaccination['dose'].unique()),data=vaccination)
fig.tight_layout()
a3 = smf.ols('response ~ dose',data=vaccination).fit()
aov_table3 = sm.stats.anova_lm(a3, typ=2)
print(aov_table3)
sum_sq df F PR(>F)
dose 19083.811 4.0 7.928789 0.000015
Residual 57163.899 95.0 NaN NaN
Chapter 2.2.4
from statsmodels.stats.multicomp import *
tukey = pairwise_tukeyhsd(vaccination['response'], vaccination['dose'])
print(tukey)
Multiple Comparison of Means - Tukey HSD,FWER=0.05
===============================================
group1 group2 meandiff lower upper reject
-----------------------------------------------
0 ml 10 ml 10.695 -10.8767 32.2667 False
0 ml 20 ml -24.545 -46.1167 -2.9733 True
0 ml 30 ml -16.2 -37.7717 5.3717 False
0 ml 40 ml -23.35 -44.9217 -1.7783 True
10 ml 20 ml -35.24 -56.8117 -13.6683 True
10 ml 30 ml -26.895 -48.4667 -5.3233 True
10 ml 40 ml -34.045 -55.6167 -12.4733 True
20 ml 30 ml 8.345 -13.2267 29.9167 False
20 ml 40 ml 1.195 -20.3767 22.7667 False
30 ml 40 ml -7.15 -28.7217 14.4217 False
-----------------------------------------------
fig = plt.figure()
ax = fig.add_subplot(1,1,1)
tukey.plot_simultaneous(ax=ax,figsize=(7, 6))
ax.vlines(x=0,ymin=-0.5,ymax=4.5, color="red")
fig.tight_layout()
import scikit_posthocs as sp
print(
sp.posthoc_ttest(vaccination,val_col = 'response', group_col = 'dose',
pool_sd = True, equal_var = False, p_adjust = 'holm')
)
0 ml 10 ml 20 ml 30 ml 40 ml
0 ml -1.000000 0.684854 0.014625 0.197178 0.020066
10 ml 0.684854 -1.000000 0.000163 0.006334 0.000266
20 ml 0.014625 0.000163 -1.000000 0.854238 0.877895
30 ml 0.197178 0.006334 0.854238 -1.000000 0.854238
40 ml 0.020066 0.000266 0.877895 0.854238 -1.000000
Tests such as Games-Howell and Scheffé are available in the STAMP package (for Python 2.x).
Tests such as SNK and HSD are available in the pyvttbl package (for Python 2.x).
Chapter 2.2.5
# Bartlett's test:
stats.bartlett(*[x[1] for x in vaccination.groupby(['dose'])['response']])
BartlettResult(statistic=5.6387376714867843, pvalue=0.22780081437451941)
# Fligner's test:
stats.fligner(*[x[1] for x in vaccination.groupby(['dose'])['response']])
FlignerResult(statistic=4.8066091744172104, pvalue=0.30772224950107929)
# Levene's test:
stats.levene(*[x[1] for x in vaccination.groupby(['dose'])['response']])
LeveneResult(statistic=1.367892839401103, pvalue=0.25091487133507662)
# Fligner's test centered on the mean:
stats.fligner(*[x[1] for x in vaccination.groupby(['dose'])['response']],center="mean")
FlignerResult(statistic=5.7187555428637866, pvalue=0.22115934922712349)
# Levene's test centered on the mean:
stats.levene(*[x[1] for x in vaccination.groupby(['dose'])['response']],center="mean")
LeveneResult(statistic=1.6203410035862478, pvalue=0.17547959058356802)
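The five tests can be collected into a single table for easier comparison; a minimal sketch:
# homogeneity-of-variance tests side by side (statistic, p-value)
groups = [g for _, g in vaccination.groupby(['dose'])['response']]
rows = {'Bartlett': stats.bartlett(*groups),
        'Fligner (median)': stats.fligner(*groups),
        'Levene (median)': stats.levene(*groups),
        'Fligner (mean)': stats.fligner(*groups, center='mean'),
        'Levene (mean)': stats.levene(*groups, center='mean')}
print(pd.DataFrame(rows, index=['statistic', 'p-value']).T)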
Chapter 2.2.6
from patsy.contrasts import Treatment
contrast = Treatment().code_without_intercept([1,2,3,4,5])
print(contrast.matrix.T)
[[ 0. 1. 0. 0. 0.]
[ 0. 0. 1. 0. 0.]
[ 0. 0. 0. 1. 0.]
[ 0. 0. 0. 0. 1.]]
from patsy.contrasts import Helmert
contrast = Helmert().code_without_intercept([1,2,3,4,5])
print(contrast.matrix.T)
[[-1. 1. 0. 0. 0.]
[-1. -1. 2. 0. 0.]
[-1. -1. -1. 3. 0.]
[-1. -1. -1. -1. 4.]]
from patsy.contrasts import Poly
contrast = Poly().code_without_intercept([1,2,3,4,5])
print(contrast.matrix.T)
[[ -6.32455532e-01 -3.16227766e-01 2.77555756e-17 3.16227766e-01
6.32455532e-01]
[ 5.34522484e-01 -2.67261242e-01 -5.34522484e-01 -2.67261242e-01
5.34522484e-01]
[ -3.16227766e-01 6.32455532e-01 2.49800181e-16 -6.32455532e-01
3.16227766e-01]
[ 1.19522861e-01 -4.78091444e-01 7.17137166e-01 -4.78091444e-01
1.19522861e-01]]
from patsy.contrasts import Diff
contrast = Diff().code_without_intercept([1,2,3,4,5])
print(contrast.matrix.T)
[[-0.8 0.2 0.2 0.2 0.2]
[-0.6 -0.6 0.4 0.4 0.4]
[-0.4 -0.4 -0.4 0.6 0.6]
[-0.2 -0.2 -0.2 -0.2 0.8]]
from patsy import dmatrices
y, X = dmatrices("response ~ dose", vaccination,return_type = 'dataframe')
print(X[:6])
Intercept dose[T.10 ml] dose[T.20 ml] dose[T.30 ml] dose[T.40 ml]
20 1.0 0.0 0.0 0.0 0.0
6 1.0 0.0 0.0 0.0 0.0
1 1.0 0.0 0.0 0.0 0.0
14 1.0 0.0 0.0 0.0 0.0
19 1.0 0.0 0.0 0.0 0.0
8 1.0 0.0 0.0 0.0 0.0
model01 = smf.ols('response ~ dose', data=vaccination, missing='drop').fit()
print(model01.summary())
OLS Regression Results
==============================================================================
Dep. Variable: response R-squared: 0.250
Model: OLS Adj. R-squared: 0.219
Method: Least Squares F-statistic: 7.929
Date: pon, 01 sty 2018 Prob (F-statistic): 1.47e-05
Time: 11:37:59 Log-Likelihood: -459.32
No. Observations: 100 AIC: 928.6
Df Residuals: 95 BIC: 941.7
Df Model: 4
Covariance Type: nonrobust
=================================================================================
coef std err t P>|t| [0.025 0.975]
---------------------------------------------------------------------------------
Intercept 108.5700 5.485 19.794 0.000 97.681 119.459
dose[T.10 ml] 10.6950 7.757 1.379 0.171 -4.705 26.095
dose[T.20 ml] -24.5450 7.757 -3.164 0.002 -39.945 -9.145
dose[T.30 ml] -16.2000 7.757 -2.088 0.039 -31.600 -0.800
dose[T.40 ml] -23.3500 7.757 -3.010 0.003 -38.750 -7.950
==============================================================================
Omnibus: 0.124 Durbin-Watson: 0.547
Prob(Omnibus): 0.940 Jarque-Bera (JB): 0.297
Skew: 0.037 Prob(JB): 0.862
Kurtosis: 2.743 Cond. No. 5.83
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
kontr = Diff().code_without_intercept([1,2,3,4,5])
model02 = smf.ols('response ~ C(dose, kontr)', data=vaccination, missing='drop').fit()
print(model02.summary())
OLS Regression Results
==============================================================================
Dep. Variable: response R-squared: 0.250
Model: OLS Adj. R-squared: 0.219
Method: Least Squares F-statistic: 7.929
Date: pon, 01 sty 2018 Prob (F-statistic): 1.47e-05
Time: 11:37:59 Log-Likelihood: -459.32
No. Observations: 100 AIC: 928.6
Df Residuals: 95 BIC: 941.7
Df Model: 4
Covariance Type: nonrobust
=======================================================================================
coef std err t P>|t| [0.025 0.975]
---------------------------------------------------------------------------------------
Intercept 97.8900 2.453 39.906 0.000 93.020 102.760
C(dose, kontr)[D.1] 10.6950 7.757 1.379 0.171 -4.705 26.095
C(dose, kontr)[D.2] -35.2400 7.757 -4.543 0.000 -50.640 -19.840
C(dose, kontr)[D.3] 8.3450 7.757 1.076 0.285 -7.055 23.745
C(dose, kontr)[D.4] -7.1500 7.757 -0.922 0.359 -22.550 8.250
==============================================================================
Omnibus: 0.124 Durbin-Watson: 0.547
Prob(Omnibus): 0.940 Jarque-Bera (JB): 0.297
Skew: 0.037 Prob(JB): 0.862
Kurtosis: 2.743 Cond. No. 4.25
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
kontrP = Poly().code_without_intercept([1,2,3,4,5])
model_va_7 = smf.ols('response ~ C(dose, kontrP)', data=vaccination, missing='drop').fit()
print(model_va_7.summary())
OLS Regression Results
==============================================================================
Dep. Variable: response R-squared: 0.250
Model: OLS Adj. R-squared: 0.219
Method: Least Squares F-statistic: 7.929
Date: pon, 01 sty 2018 Prob (F-statistic): 1.47e-05
Time: 11:37:59 Log-Likelihood: -459.32
No. Observations: 100 AIC: 928.6
Df Residuals: 95 BIC: 941.7
Df Model: 4
Covariance Type: nonrobust
=============================================================================================
coef std err t P>|t| [0.025 0.975]
---------------------------------------------------------------------------------------------
Intercept 97.8900 2.453 39.906 0.000 93.020 102.760
C(dose, kontrP).Linear -23.2728 5.485 -4.243 0.000 -34.162 -12.383
C(dose, kontrP).Quadratic 2.1100 5.485 0.385 0.701 -8.779 12.999
C(dose, kontrP).Cubic 9.6260 5.485 1.755 0.082 -1.263 20.515
C(dose, kontrP)^4 -17.7611 5.485 -3.238 0.002 -28.650 -6.872
==============================================================================
Omnibus: 0.124 Durbin-Watson: 0.547
Prob(Omnibus): 0.940 Jarque-Bera (JB): 0.297
Skew: 0.037 Prob(JB): 0.862
Kurtosis: 2.743 Cond. No. 2.24
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
from patsy.contrasts import *
levels = [1,2,3,4,5]
contrast01 = Helmert().code_without_intercept(levels)
contrast02 = Poly().code_without_intercept(levels)
contrast04 = Sum().code_without_intercept(levels)
contrast05 = Treatment().code_without_intercept(levels)
contrast06 = Diff().code_without_intercept(levels)
c = [contrast01,contrast02,contrast04,contrast05,contrast06]
n = [smf.ols('response ~ C(dose, i)', data=vaccination, missing='drop').fit() for i in c]
T = pd.DataFrame()
T['Helmert'] = n[0].params.values
T['Poly'] = n[1].params.values
T['Sum'] = n[2].params.values
T['Treatment'] = n[3].params.values
T['Diff'] = n[4].params.values
print(np.transpose(T))
0 1 2 3 4
Helmert 97.89 5.347500 -9.964167 -2.895833 -3.167500
Poly 97.89 -23.272782 2.110028 9.625973 -17.761097
Sum 97.89 10.680000 21.375000 -13.865000 -5.520000
Treatment 108.57 10.695000 -24.545000 -16.200000 -23.350000
Diff 97.89 10.695000 -35.240000 8.345000 -7.150000
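The rows are easiest to interpret against the raw group means: under Diff coding the coefficients are successive differences of the dose means, and the intercept is the unweighted mean of the group means. A sketch verifying this:
# group means, their unweighted mean (~97.89, the intercept under the
# Helmert/Poly/Sum/Diff codings) and successive differences (the Diff row)
gm = vaccination.groupby('dose')['response'].mean()
print(gm.mean())
print(gm.diff().dropna())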
Chapter 2.3.2
from rpy2.robjects import r, pandas2ri
pandas2ri.activate()
r('library(PBImisc)')
r.data('schizophrenia')
schizophrenia = r['schizophrenia']
schizophrenia = schizophrenia.rename(columns=lambda x: x.replace('.', '_'))
print(schizophrenia.head(2))
NfkB CD28 IFN Dikeos_manic Dikeos_reality_distortion \
1 AG TT TT+AA 3.0 19.0
2 AG TC+CC AT 5.0 29.0
Dikeos_depression Dikeos_disorganization Dikeos_negative Dikeos_sum
1 12.0 42.0 2.0 78.0
2 16.0 46.0 0.0 96.0
import matplotlib.pyplot as plt
import seaborn as sns
fig = plt.figure(figsize=(10,8))
ax1 = fig.add_subplot(2,2,1)
ax2 = fig.add_subplot(2,2,2)
ax3 = fig.add_subplot(2,2,3)
ax4 = fig.add_subplot(2,2,4)
sns.factorplot(x="CD28", y="Dikeos_sum", hue="NfkB",ci=None,data=schizophrenia, palette="Set2",order=["TC+CC","TT"],ax=ax1)
sns.factorplot(x="NfkB", y="Dikeos_sum", hue="CD28",ci=None,data=schizophrenia, palette="Set2",order=["AA","AG","GG"],ax=ax2)
sns.factorplot(x="IFN", y="Dikeos_sum", hue="NfkB",ci=None,data=schizophrenia, palette="Set2",order=["AT","TT+AA"],ax=ax3)
sns.factorplot(x="NfkB", y="Dikeos_sum", hue="IFN",ci=None,data=schizophrenia, palette="Set2",order=["AA","AG","GG"],ax=ax4)
fig.tight_layout()
modelNCbezI = smf.ols('Dikeos_sum ~ NfkB+CD28', data=schizophrenia, missing='drop').fit()
modelNCzI = smf.ols('Dikeos_sum ~ NfkB*CD28', data=schizophrenia, missing='drop').fit()
print(sms.anova_lm(modelNCbezI,modelNCzI))
df_resid ssr df_diff ss_diff F Pr(>F)
0 94.0 20715.300537 0.0 NaN NaN NaN
1 92.0 15040.258968 2.0 5675.041568 17.356876 4.021124e-07
print(schizophrenia.iloc[:,[0,1,2]].head(6))
NfkB CD28 IFN
1 AG TT TT+AA
2 AG TC+CC AT
3 GG TT AT
4 AG TT TT+AA
5 GG TT AT
6 GG TT AT
from patsy import dmatrices
y, X = dmatrices("Dikeos_sum ~ NfkB+CD28", schizophrenia,return_type = 'dataframe')
print(X[:6])
Intercept NfkB[T.AG] NfkB[T.GG] CD28[T.TT]
1 1.0 1.0 0.0 1.0
2 1.0 1.0 0.0 0.0
3 1.0 0.0 1.0 1.0
4 1.0 1.0 0.0 1.0
5 1.0 0.0 1.0 1.0
6 1.0 0.0 1.0 1.0
from patsy import dmatrices
y, X = dmatrices("Dikeos_sum ~ NfkB*CD28", schizophrenia, return_type = 'dataframe')
print(X[:6])
Intercept NfkB[T.AG] NfkB[T.GG] CD28[T.TT] NfkB[T.AG]:CD28[T.TT] \
1 1.0 1.0 0.0 1.0 1.0
2 1.0 1.0 0.0 0.0 0.0
3 1.0 0.0 1.0 1.0 0.0
4 1.0 1.0 0.0 1.0 1.0
5 1.0 0.0 1.0 1.0 0.0
6 1.0 0.0 1.0 1.0 0.0
NfkB[T.GG]:CD28[T.TT]
1 0.0
2 0.0
3 1.0
4 0.0
5 1.0
6 1.0
Chapter 2.3.3.1
aov1 = smf.ols('Dikeos_sum ~ NfkB*CD28',data=schizophrenia).fit()
print(sm.stats.anova_lm(aov1, typ=1))
df sum_sq mean_sq F PR(>F)
NfkB 2.0 21119.448986 10559.724493 64.592947 2.988122e-18
CD28 1.0 4.525988 4.525988 0.027685 8.682165e-01
NfkB:CD28 2.0 5675.041568 2837.520784 17.356876 4.021124e-07
Residual 92.0 15040.258968 163.481076 NaN NaN
aov2 = smf.ols('Dikeos_sum ~ CD28*NfkB',data=schizophrenia).fit()
print(sm.stats.anova_lm(aov2, typ=1))
df sum_sq mean_sq F PR(>F)
CD28 1.0 108.012107 108.012107 0.660701 4.184111e-01
NfkB 2.0 21015.962867 10507.981433 64.276439 3.409216e-18
CD28:NfkB 2.0 5675.041568 2837.520784 17.356876 4.021124e-07
Residual 92.0 15040.258968 163.481076 NaN NaN
aov3 = smf.ols('Dikeos_sum ~ CD28*NfkB',data=schizophrenia).fit()
print(aov3.summary().tables[1])
=========================================================================================
coef std err t P>|t| [0.025 0.975]
-----------------------------------------------------------------------------------------
Intercept 76.7059 3.101 24.735 0.000 70.547 82.865
CD28[T.TT] -12.2559 4.218 -2.906 0.005 -20.633 -3.879
NfkB[T.AG] 22.0084 4.615 4.769 0.000 12.844 31.173
NfkB[T.GG] -43.7059 9.558 -4.573 0.000 -62.689 -24.723
CD28[T.TT]:NfkB[T.AG] 16.0933 5.925 2.716 0.008 4.326 27.861
CD28[T.TT]:NfkB[T.GG] 60.6309 10.476 5.788 0.000 39.824 81.437
=========================================================================================
from statsmodels.stats.multicomp import *
tukey_NfkB = pairwise_tukeyhsd(schizophrenia['Dikeos_sum'], schizophrenia['NfkB'])
print(tukey_NfkB)
Multiple Comparison of Means - Tukey HSD,FWER=0.05
===============================================
group1 group2 meandiff lower upper reject
-----------------------------------------------
AA AG 31.2212 23.3359 39.1066 True
AA GG 5.9189 -4.1865 16.0244 False
AG GG -25.3023 -35.1743 -15.4303 True
-----------------------------------------------
tukey_CD28 = pairwise_tukeyhsd(schizophrenia['Dikeos_sum'], schizophrenia['CD28'])
print(tukey_CD28)
Multiple Comparison of Means - Tukey HSD,FWER=0.05
=============================================
group1 group2 meandiff lower upper reject
---------------------------------------------
TC+CC TT 2.2214 -6.6247 11.0675 False
---------------------------------------------
tukey_NfkB_CD28 = pairwise_tukeyhsd(schizophrenia['Dikeos_sum'], schizophrenia['CD28']+ ':' + schizophrenia['NfkB'])
print(tukey_NfkB_CD28)
Multiple Comparison of Means - Tukey HSD,FWER=0.05
===================================================
group1 group2 meandiff lower upper reject
---------------------------------------------------
TC+CC:AA TC+CC:AG 22.0084 8.5774 35.4394 True
TC+CC:AA TC+CC:GG -43.7059 -71.5256 -15.8862 True
TC+CC:AA TT:AA -12.2559 -24.5324 0.0207 False
TC+CC:AA TT:AG 25.8458 14.4782 37.2135 True
TC+CC:AA TT:GG 4.6691 -8.2934 17.6316 False
TC+CC:AG TC+CC:GG -65.7143 -93.846 -37.5826 True
TC+CC:AG TT:AA -34.2643 -47.2324 -21.2962 True
TC+CC:AG TT:AG 3.8374 -8.2737 15.9486 False
TC+CC:AG TT:GG -17.3393 -30.9585 -3.7201 True
TC+CC:GG TT:AA 31.45 3.8508 59.0492 True
TC+CC:GG TT:AG 69.5517 42.3446 96.7588 True
TC+CC:GG TT:GG 48.375 20.4639 76.2861 True
TT:AA TT:AG 38.1017 27.2849 48.9185 True
TT:AA TT:GG 16.925 4.4428 29.4072 True
TT:AG TT:GG -21.1767 -32.7662 -9.5873 True
---------------------------------------------------
schizophrenia2 = schizophrenia.copy()  # copy, so the new column does not alias the original frame
schizophrenia2['CD28_NfkB'] = schizophrenia['CD28']+ ':' + schizophrenia['NfkB']
fig = plt.figure(figsize=(12,4))
ax1 = fig.add_subplot(1,3,1)
ax2 = fig.add_subplot(1,3,2)
ax3 = fig.add_subplot(1,3,3)
sns.pointplot(x="CD28", y="Dikeos_sum", data=schizophrenia, join=False, ax=ax1)
sns.pointplot(x="NfkB", y="Dikeos_sum", data=schizophrenia, join=False, ax=ax2)
sns.pointplot(x="CD28_NfkB", y="Dikeos_sum", data=schizophrenia2, join=False, ax=ax3)
fig.tight_layout()
table_NfkB = pd.pivot_table(schizophrenia, values='Dikeos_sum',
columns=['NfkB'], aggfunc=[np.mean,len])
print(table_NfkB)
mean len
NfkB AA AG GG AA AG GG
Dikeos_sum 70.081081 101.302326 76.0 37.0 43.0 18.0
table_CD28 = pd.pivot_table(schizophrenia, values='Dikeos_sum',
columns=['CD28'], aggfunc=[np.mean,len])
print(table_CD28)
mean len
CD28 TC+CC TT TC+CC TT
Dikeos_sum 83.393939 85.615385 33.0 65.0
table_NfkB_CD28 = pd.pivot_table(schizophrenia, values='Dikeos_sum', index=['NfkB'],
columns=['CD28'], aggfunc=[np.mean,len])
print(table_NfkB_CD28)
mean len
CD28 TC+CC TT TC+CC TT
NfkB
AA 76.705882 64.450000 17.0 20.0
AG 98.714286 102.551724 14.0 29.0
GG 33.000000 81.375000 2.0 16.0
a1 = smf.ols('Dikeos_sum ~ (NfkB+CD28+IFN)**2',data=schizophrenia).fit()
print(sms.anova_lm(a1, typ=1))
df sum_sq mean_sq F PR(>F)
NfkB 2.0 21119.448986 10559.724493 64.029638 6.855345e-18
CD28 1.0 4.525988 4.525988 0.027444 8.688035e-01
IFN 1.0 151.189042 151.189042 0.916745 3.409541e-01
NfkB:CD28 2.0 6039.797669 3019.898835 18.311371 2.243657e-07
NfkB:IFN 2.0 10.618705 5.309352 0.032194 9.683305e-01
CD28:IFN 1.0 0.794700 0.794700 0.004819 9.448152e-01
Residual 88.0 14512.900420 164.919323 NaN NaN
a2 = smf.ols('Dikeos_sum ~ NfkB*CD28*IFN',data=schizophrenia).fit()
print(sms.anova_lm(a2, typ=1))
df sum_sq mean_sq F PR(>F)
NfkB 2.0 21119.448986 10559.724493 63.560260 1.128618e-17
CD28 1.0 4.525988 4.525988 0.027242 8.692897e-01
IFN 1.0 151.189042 151.189042 0.910025 3.427814e-01
NfkB:CD28 2.0 6039.797669 3019.898835 18.177137 2.605015e-07
NfkB:IFN 2.0 10.618705 5.309352 0.031958 9.685591e-01
CD28:IFN 1.0 0.794700 0.794700 0.004783 9.450211e-01
NfkB:CD28:IFN 2.0 225.100023 112.550012 0.677452 5.105989e-01
Residual 86.0 14287.800397 166.137214 NaN NaN
Chapter 2.4.2
from rpy2.robjects import r, pandas2ri
pandas2ri.activate()
r('library(PBImisc)')
r.data('ecap')
ecap = r['ecap']
print(ecap.head(2))
city district sex weight height house.surface PNIF age allergenes
1 2 Zamosc 2 67.0 174.0 53 40 23.0 6
2 2 Gorzkow 1 55.0 159.0 70 70 23.0 0
import rpy2.robjects as robjects
model_d_m = robjects.r("model_dzielnica_miasto <- aov(PNIF~city/district,data=ecap); summary(model_dzielnica_miasto)")
print(model_d_m)
Df Sum Sq Mean Sq F value Pr(>F)
city 8 929717 116215 57.663 <2e-16 ***
city:district 81 162376 2005 0.995 0.494
Residuals 2012 4054978 2015
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
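The nested model can also be fitted without rpy2, since patsy supports the same / nesting operator as R (city/district expands to city + city:district); a sketch, assuming only that the labelling of terms differs from R's aov:
# nested one-way layout in statsmodels: districts nested within cities
nested = smf.ols('PNIF ~ city / district', data=ecap).fit()
print(sms.anova_lm(nested, typ=1))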
fig = plt.figure(figsize=(7,10))
ax = fig.add_subplot(1,1,1)
ax = sns.boxplot(x="PNIF", y="district",color='C0',linewidth=0.5,showmeans=True,
order=np.sort(ecap['district'].unique()),
data=ecap)
fig.tight_layout()
import rpy2.robjects as robjects
m_model = robjects.r("mmodelu <- model.matrix(model_dzielnica_miasto)")
print(robjects.r("dim(mmodelu)"))
[2102 657]
import rpy2.robjects as robjects
model_d_m_w_p = robjects.r("model_dzielnica_miasto_wiek_plec <- aov(PNIF~city/district+age+sex+height,data=ecap); summary(model_dzielnica_miasto_wiek_plec)")
print(model_d_m_w_p)
Df Sum Sq Mean Sq F value Pr(>F)
city 8 926088 115761 63.354 <2e-16 ***
age 1 1890 1890 1.034 0.309
sex 1 361392 361392 197.784 <2e-16 ***
height 1 3385 3385 1.853 0.174
city:district 81 159806 1973 1.080 0.297
Residuals 1996 3647098 1827
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
13 observations deleted due to missingness
import rpy2.robjects as robjects
model_d_m_w_p2 = robjects.r("model_dzielnica_miasto_wiek_plec2 <- aov(PNIF~city/district+height+age+sex,data=ecap); summary(model_dzielnica_miasto_wiek_plec2)")
print(model_d_m_w_p2)
Df Sum Sq Mean Sq F value Pr(>F)
city 8 926088 115761 63.354 <2e-16 ***
height 1 232334 232334 127.153 <2e-16 ***
age 1 52 52 0.028 0.866
sex 1 134281 134281 73.490 <2e-16 ***
city:district 81 159806 1973 1.080 0.297
Residuals 1996 3647098 1827
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
13 observations deleted due to missingness
Chapter 2.5.2
from rpy2.robjects import r, pandas2ri
pandas2ri.activate()
r('library(PBImisc)')
r.data('endometriosis')
endometriosis = r['endometriosis']
print(endometriosis.head(2))
disease phase alpha.factor beta.factor
1 endometrioza folikularna 14.1 4.1
2 endometrioza folikularna 0.4 6.4
print(endometriosis.describe(include='all'))
disease phase alpha.factor beta.factor
count 165 165 165.000000 165.000000
unique 2 2 NaN NaN
top endometrioza folikularna NaN NaN
freq 105 111 NaN NaN
mean NaN NaN 5.775152 9.521212
std NaN NaN 7.956608 10.932196
min NaN NaN 0.400000 0.500000
25% NaN NaN 1.200000 2.400000
50% NaN NaN 1.900000 5.300000
75% NaN NaN 7.200000 12.800000
max NaN NaN 50.100000 62.900000
endometriosis['log_alpha_factor'] = np.log(endometriosis['alpha.factor'])
endometriosis['log_beta_factor'] = np.log(endometriosis['beta.factor'])
g = sns.FacetGrid(endometriosis, col="disease", row="phase", margin_titles=True, height=4, aspect=1)
g = (g.map(sns.regplot, "log_beta_factor", "log_alpha_factor"))
endometriosis['CF'] = endometriosis['disease']+ ' ' + endometriosis['phase']
cf = endometriosis['CF'].unique().tolist()
cf
['endometrioza folikularna',
'endometrioza lutealna',
'zdrowa folikularna',
'zdrowa lutealna']
md = [smf.ols("log_alpha_factor ~ log_beta_factor+CF", data=endometriosis,subset = endometriosis['CF']==i).fit() for i in cf]
coef = [md[i].params for i in range(len(cf))]
print(pd.DataFrame(coef))
Intercept log_beta_factor
0 0.390063 0.165557
1 -0.067914 0.670450
2 1.454112 0.093236
3 -0.258966 0.766745
import rpy2.robjects as robjects
modelF = robjects.r("summary(modelF <- aov(log(alpha.factor)~log(beta.factor)*phase,data=endometriosis))")
print(modelF)
Df Sum Sq Mean Sq F value Pr(>F)
log(beta.factor) 1 20.04 20.045 17.922 3.86e-05 ***
phase 1 0.38 0.381 0.341 0.56029
log(beta.factor):phase 1 8.62 8.624 7.711 0.00614 **
Residuals 161 180.07 1.118
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
print(robjects.r("head(model.matrix(modelF))"))
[[ 1. 1.41098697 0. 0. ]
[ 1. 1.85629799 0. 0. ]
[ 1. 3.43075618 0. 0. ]
[ 1. 3.07269331 0. 0. ]
[ 1. 1.79175947 0. 0. ]
[ 1. 3.09557761 0. 0. ]]
modelC = robjects.r("summary(modelC <- aov(log(alpha.factor)~log(beta.factor)*disease,data=endometriosis))")
print(modelC)
Df Sum Sq Mean Sq F value Pr(>F)
log(beta.factor) 1 20.04 20.045 18.652 2.73e-05 ***
disease 1 14.49 14.487 13.480 0.000328 ***
log(beta.factor):disease 1 1.57 1.565 1.457 0.229245
Residuals 161 173.02 1.075
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
modelFC = robjects.r("summary(modelC <- aov(log(alpha.factor)~log(beta.factor)*phase*disease,data=endometriosis))")
print(modelFC)
Df Sum Sq Mean Sq F value Pr(>F)
log(beta.factor) 1 20.04 20.045 20.046 1.45e-05 ***
phase 1 0.38 0.381 0.381 0.53797
disease 1 14.85 14.846 14.847 0.00017 ***
log(beta.factor):phase 1 9.28 9.280 9.280 0.00272 **
log(beta.factor):disease 1 0.46 0.464 0.464 0.49663
phase:disease 1 6.93 6.933 6.933 0.00931 **
log(beta.factor):phase:disease 1 0.18 0.178 0.178 0.67381
Residuals 157 156.99 1.000
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Chapter 2.6.2
from rpy2.robjects import r, pandas2ri
pandas2ri.activate()
r('library(PBImisc)')
r.data('YXZ')
YXZ = r['YXZ']
print(YXZ.head(2))
Y X Z
1 -3.295356 0.730666 0.578293
2 -1.537665 3.091307 4.625046
print(YXZ.corr())
Y X Z
Y 1.000000 0.244715 0.264603
X 0.244715 1.000000 0.776488
Z 0.264603 0.776488 1.000000
print(smf.ols('Y~X + Z',data=YXZ).fit().summary())
OLS Regression Results
==============================================================================
Dep. Variable: Y R-squared: 0.074
Model: OLS Adj. R-squared: 0.055
Method: Least Squares F-statistic: 3.870
Date: pon, 01 sty 2018 Prob (F-statistic): 0.0242
Time: 11:38:08 Log-Likelihood: -309.41
No. Observations: 100 AIC: 624.8
Df Residuals: 97 BIC: 632.6
Df Model: 2
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept -0.2016 0.551 -0.366 0.715 -1.295 0.892
X 0.3013 0.473 0.638 0.525 -0.637 1.239
Z 0.4462 0.368 1.211 0.229 -0.285 1.177
==============================================================================
Omnibus: 1.598 Durbin-Watson: 1.685
Prob(Omnibus): 0.450 Jarque-Bera (JB): 1.511
Skew: 0.192 Prob(JB): 0.470
Kurtosis: 2.537 Cond. No. 3.03
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
print(sms.anova_lm(smf.ols('Y ~ X + Z',data=YXZ).fit(), typ=1))
df sum_sq mean_sq F PR(>F)
X 1.0 184.371793 184.371793 6.272410 0.013931
Z 1.0 43.131981 43.131981 1.467369 0.228704
Residual 97.0 2851.226860 29.394091 NaN NaN
print(sms.anova_lm(smf.ols('Y ~ Z + X',data=YXZ).fit(), typ=1))
df sum_sq mean_sq F PR(>F)
Z 1.0 215.555828 215.555828 7.333305 0.008001
X 1.0 11.947947 11.947947 0.406474 0.525266
Residual 97.0 2851.226860 29.394091 NaN NaN
print(sms.anova_lm(smf.ols('Y ~ X',data=YXZ).fit(), typ=1))
df sum_sq mean_sq F PR(>F)
X 1.0 184.371793 184.371793 6.242638 0.014135
Residual 98.0 2894.358842 29.534274 NaN NaN
print(sms.anova_lm(smf.ols('Y ~ Z',data=YXZ).fit(), typ=1))
df sum_sq mean_sq F PR(>F)
Z 1.0 215.555828 215.555828 7.377989 0.007806
Residual 98.0 2863.174807 29.216069 NaN NaN
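The order dependence above is a property of Type I (sequential) sums of squares; with Type II sums of squares each term is tested after the other, so both orderings give the same table. A sketch:
# Type II SS: invariant to the order of X and Z in the formula
print(sms.anova_lm(smf.ols('Y ~ X + Z', data=YXZ).fit(), typ=2))
print(sms.anova_lm(smf.ols('Y ~ Z + X', data=YXZ).fit(), typ=2))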
Chapter 2.6.3
from rpy2.robjects import r, pandas2ri
pandas2ri.activate()
r('library(PBImisc)')
r.data('elastase')
elastase = r['elastase']
print(elastase.head(2))
sex age weight elastase GFR
1 0 53 64 311 43.022
2 0 40 50 314 49.128
def poly(x, p):
    # raw powers x**0 .. x**p, orthogonalised by QR; dropping the first
    # (constant) column mimics R's poly(x, p)
    x = np.array(x)
    X = np.transpose(np.vstack([x**k for k in range(p + 1)]))
    return np.linalg.qr(X)[0][:, 1:]
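The helper mimics R's poly() by orthogonalising the raw powers with a QR decomposition, so its columns should be orthonormal and orthogonal to the intercept; a quick check, as a sketch:
# Q.T @ Q should be the 3x3 identity for the orthogonalised basis
Q = poly(elastase['elastase'], 3)
print(np.allclose(Q.T @ Q, np.eye(3)))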
print(smf.ols('GFR~weight+poly(elastase,3)',data=elastase).fit().summary())
OLS Regression Results
==============================================================================
Dep. Variable: GFR R-squared: 0.357
Model: OLS Adj. R-squared: 0.304
Method: Least Squares F-statistic: 6.789
Date: pon, 01 sty 2018 Prob (F-statistic): 0.000198
Time: 11:38:09 Log-Likelihood: -218.28
No. Observations: 54 AIC: 446.6
Df Residuals: 49 BIC: 456.5
Df Model: 4
Covariance Type: nonrobust
========================================================================================
coef std err t P>|t| [0.025 0.975]
----------------------------------------------------------------------------------------
Intercept 78.5673 12.469 6.301 0.000 53.511 103.624
weight -0.5330 0.174 -3.060 0.004 -0.883 -0.183
poly(elastase, 3)[0] 37.5072 14.568 2.575 0.013 8.231 66.784
poly(elastase, 3)[1] -45.3930 14.579 -3.114 0.003 -74.690 -16.095
poly(elastase, 3)[2] -18.1676 14.469 -1.256 0.215 -47.245 10.909
==============================================================================
Omnibus: 2.775 Durbin-Watson: 1.902
Prob(Omnibus): 0.250 Jarque-Bera (JB): 1.620
Skew: 0.127 Prob(JB): 0.445
Kurtosis: 2.190 Cond. No. 550.
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Chapter 2.6.4
import scipy.stats as sc
import statsmodels.api as sm
y = sc.norm.rvs(size=100,random_state=13)
X = sc.norm.rvs(size=(100,98),random_state=13)
mod = sm.OLS(y,X).fit()
print(mod.summary().tables[0])
OLS Regression Results
==============================================================================
Dep. Variable: y R-squared: 0.999
Model: OLS Adj. R-squared: 0.952
Method: Least Squares F-statistic: 21.06
Date: pon, 01 sty 2018 Prob (F-statistic): 0.0464
Time: 11:38:09 Log-Likelihood: 211.96
No. Observations: 100 AIC: -227.9
Df Residuals: 2 BIC: 27.39
Df Model: 98
Covariance Type: nonrobust
==============================================================================
import scipy.stats as sc
import statsmodels.api as sm
y = sc.norm.rvs(size=100,random_state=41)
X = sc.norm.rvs(size=(100,98),random_state=41)
mod = sm.OLS(y,X).fit()
print(mod.summary().tables[0])
OLS Regression Results
==============================================================================
Dep. Variable: y R-squared: 0.955
Model: OLS Adj. R-squared: -1.235
Method: Least Squares F-statistic: 0.4361
Date: pon, 01 sty 2018 Prob (F-statistic): 0.894
Time: 11:38:09 Log-Likelihood: 6.9740
No. Observations: 100 AIC: 182.1
Df Residuals: 2 BIC: 437.4
Df Model: 98
Covariance Type: nonrobust
==============================================================================
Chapter 2.6.5
from patsy import dmatrices
from statsmodels.stats.outliers_influence import variance_inflation_factor
y, X = dmatrices("GFR ~ sex+age+weight+elastase ", data=elastase, return_type="dataframe")
VIF = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif = pd.DataFrame()
vif["VIF"] = VIF
vif.index = X.columns
print(vif)
VIF
Intercept 56.852836
sex 1.531081
age 1.244410
weight 1.772674
elastase 1.025643
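The intercept's VIF carries no collinearity information; a common (purely conventional) screen is to flag predictors with VIF above 5 or 10. A sketch:
# no predictor exceeds the conventional threshold of 5 here
vif_pred = vif.drop('Intercept')
print(vif_pred[vif_pred['VIF'] > 5])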
Chapter 2.6.6
model_el_1 = sm.OLS(elastase['GFR'],sm.add_constant(elastase.drop(['GFR'], axis=1))).fit()
print(model_el_1.summary())
OLS Regression Results
==============================================================================
Dep. Variable: GFR R-squared: 0.225
Model: OLS Adj. R-squared: 0.162
Method: Least Squares F-statistic: 3.558
Date: pon, 01 sty 2018 Prob (F-statistic): 0.0126
Time: 11:38:09 Log-Likelihood: -223.30
No. Observations: 54 AIC: 456.6
Df Residuals: 49 BIC: 466.6
Df Model: 4
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const 63.5891 16.291 3.903 0.000 30.850 96.328
sex 5.3547 5.598 0.956 0.344 -5.896 16.605
age 0.1282 0.215 0.596 0.554 -0.304 0.561
weight -0.6283 0.251 -2.505 0.016 -1.132 -0.124
elastase 0.0300 0.013 2.281 0.027 0.004 0.056
==============================================================================
Omnibus: 0.418 Durbin-Watson: 1.929
Prob(Omnibus): 0.811 Jarque-Bera (JB): 0.581
Skew: 0.125 Prob(JB): 0.748
Kurtosis: 2.558 Cond. No. 3.40e+03
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 3.4e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
model_el_2 = smf.ols('GFR ~ weight+elastase',data=elastase).fit()
print(model_el_2.summary())
OLS Regression Results
==============================================================================
Dep. Variable: GFR R-squared: 0.209
Model: OLS Adj. R-squared: 0.178
Method: Least Squares F-statistic: 6.729
Date: pon, 01 sty 2018 Prob (F-statistic): 0.00255
Time: 11:38:09 Log-Likelihood: -223.87
No. Observations: 54 AIC: 453.7
Df Residuals: 51 BIC: 459.7
Df Model: 2
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept 60.8422 15.010 4.054 0.000 30.709 90.976
weight -0.4628 0.188 -2.464 0.017 -0.840 -0.086
elastase 0.0312 0.013 2.412 0.019 0.005 0.057
==============================================================================
Omnibus: 0.289 Durbin-Watson: 1.937
Prob(Omnibus): 0.865 Jarque-Bera (JB): 0.455
Skew: 0.134 Prob(JB): 0.796
Kurtosis: 2.639 Cond. No. 3.13e+03
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 3.13e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
print(sms.anova_lm(model_el_2,model_el_1))
df_resid ssr df_diff ss_diff F Pr(>F)
0 51.0 12612.160456 0.0 NaN NaN NaN
1 49.0 12352.465441 2.0 259.695015 0.515082 0.600651
import itertools

def optim_model_01(data, var_dep='GFR', crit='AIC', h=10):
    # exhaustive best-subset search: build a formula for every non-empty
    # subset of regressors, fit the first h by OLS and rank them
    z = data.drop([var_dep], axis=1)
    N = z.shape[1]  # number of candidate regressors (not rows)
    comb0 = [list(itertools.combinations(z.columns, i)) for i in range(1, N + 1)]
    comb = list(itertools.chain.from_iterable(comb0))
    formula = [var_dep + '~' + '+'.join(w) for w in comb]
    mod = [smf.ols(f, data=data).fit() for f in formula[:h]]
    d = pd.DataFrame({'AIC': [m.aic for m in mod],
                      'BIC': [m.bic for m in mod],
                      'R2adj': [m.rsquared_adj for m in mod],
                      'param': [len(m.params) - 1 for m in mod]})
    ascending = crit != 'R2adj'  # lower AIC/BIC is better, higher adjusted R2
    ranking = d.sort_values(by=[crit], ascending=ascending)
    return [ranking, mod[ranking.index[0]]]
sol = optim_model_01(data=elastase,var_dep='GFR',crit='AIC',h=16)
print(sol[0].head())
AIC BIC R2adj param
9 453.730727 459.697679 0.177758 2
12 454.996740 462.952677 0.172636 3
13 455.606090 463.562027 0.163247 3
14 456.607215 466.552135 0.161819 4
2 457.565787 461.543755 0.101548 1
print(sol[1].summary())
OLS Regression Results
==============================================================================
Dep. Variable: GFR R-squared: 0.209
Model: OLS Adj. R-squared: 0.178
Method: Least Squares F-statistic: 6.729
Date: pon, 01 sty 2018 Prob (F-statistic): 0.00255
Time: 11:38:09 Log-Likelihood: -223.87
No. Observations: 54 AIC: 453.7
Df Residuals: 51 BIC: 459.7
Df Model: 2
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept 60.8422 15.010 4.054 0.000 30.709 90.976
weight -0.4628 0.188 -2.464 0.017 -0.840 -0.086
elastase 0.0312 0.013 2.412 0.019 0.005 0.057
==============================================================================
Omnibus: 0.289 Durbin-Watson: 1.937
Prob(Omnibus): 0.865 Jarque-Bera (JB): 0.455
Skew: 0.134 Prob(JB): 0.796
Kurtosis: 2.639 Cond. No. 3.13e+03
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 3.13e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
Chapter 2.6.7
from rpy2.robjects import r, pandas2ri
pandas2ri.activate()
r('library(PBImisc)')
r.data('apartments')
apartments2 = r('na.omit(apartments[,c(13,1,3,5,7,8,9,10,14,15,16)])')
apartments2 = apartments2.rename(columns=lambda x: x.replace('.', '_'))
apartments2['construction1990'] = apartments2['construction_date'] - 1990
apartments2['starszy1990'] = apartments2['construction_date'] > 1990
centrumWsp = apartments2.query("district == 'Srodmiescie'").iloc[1,[9,10]]  # (lat, lon) of a Srodmiescie flat, used as the city-centre reference
apartments2 = apartments2.reset_index()
odl = [np.sqrt(((centrumWsp[0] - apartments2.loc[i, 'lat']) * 100)**2 +
               ((centrumWsp[1] - apartments2.loc[i, 'lon']) * 70)**2)
       for i in range(len(apartments2))]  # distance from the city centre for all 729 flats
apartments2['odleglosc'] = odl
print(apartments2.head(2))
index m2_price year surface district n_rooms floor \
0 1 10500 2008 20.0 Srodmiescie 1 7
1 2 7000 2008 25.0 Pruszkow 1 1
construction_date type condition lat lon \
0 1970 hipoteczne do remontu 52.229472 21.007614
1 1965 spoldzielcze do remontu 52.171616 20.806389
construction1990 starszy1990 odleglosc
0 -20 False 0.000000
1 -25 False 15.227681
import patsy
def poly(x, p):
    # the same QR-based mimic of R's poly() as in Chapter 2.6.3
    x = np.array(x)
    X = np.transpose(np.vstack([x**k for k in range(p + 1)]))
    return np.linalg.qr(X)[0][:, 1:]
X = apartments2.drop(['index','construction_date','lat','lon'], axis=1)
print(X.head(2))
m2_price year surface district n_rooms floor type \
0 10500 2008 20.0 Srodmiescie 1 7 hipoteczne
1 7000 2008 25.0 Pruszkow 1 1 spoldzielcze
condition construction1990 starszy1990 odleglosc
0 do remontu -20 False 0.000000
1 do remontu -25 False 15.227681
from itertools import combinations
import itertools
z = ['year','surface','poly(surface,3)','district','n_rooms','floor','type',
'construction1990','construction1990:starszy1990','odleglosc','poly(odleglosc,3)',
'district:year','district:type','district:condition','district:surface']
len(z)
15
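With 15 candidate terms the full search space has 2^15 - 1 = 32767 non-empty subsets, the figure quoted in the plot titles below:
from math import comb
print(sum(comb(15, k) for k in range(1, 16)))   # 32767 = 2**15 - 1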
def optim_model_02(data=X, var_dep='m2_price', z=z, crit='AIC', h=100):
    # enumerate all non-empty subsets of the candidate terms in z
    N = len(z)
    comb0 = [np.array(list(itertools.combinations(z, i))) for i in range(1, N + 1)]
    comb = list(itertools.chain.from_iterable(comb0))
    lc = len(comb)
    wariant = [comb[i].tolist() for i in range(lc)]
    formula = [var_dep + '~' + '+'.join(wariant[i]) for i in range(lc)]
    # fit only the first h formulas; NB: as coded, only the first h-1 fits are scored
    mod = [smf.ols(f, data=data).fit() for f in formula[:h]]
    AIC = [mod[i].aic for i in range(h-1)]
    BIC = [mod[i].bic for i in range(h-1)]
    R2adj = [mod[i].rsquared_adj for i in range(h-1)]
    param = [len(mod[i].params) - 1 for i in range(h-1)]
    d = pd.DataFrame()
    d['AIC'] = AIC
    d['BIC'] = BIC
    d['R2adj'] = R2adj
    d['param'] = param
    # rank the scored models by the chosen criterion and return the best fit
    if crit == "AIC":
        ranking = d.sort_values(by=['AIC'])
    elif crit == "BIC":
        ranking = d.sort_values(by=['BIC'])
    elif crit == "R2adj":
        ranking = d.sort_values(by=['R2adj'], ascending=False)
    return [ranking, mod[ranking.index[0]]]
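Note that formula[:h] takes the subsets in the order produced by itertools.combinations: all single-term models first, then all pairs, and so on. With h = 400 the scan therefore covers all 15 one-term and 105 two-term subsets plus most of the 455 three-term subsets; a sketch verifying those counts:
from math import comb
total = 0
for k in (1, 2, 3):
    total += comb(15, k)
    print(k, comb(15, k), total)   # cumulative: 15, 120, 575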
res = optim_model_02(X,var_dep='m2_price',z=z,crit="BIC",h=400)
print(res[0].head(3))
AIC BIC R2adj param
334 12828.221352 12860.363068 0.383672 6
335 12822.269990 12863.595054 0.390339 8
190 12841.247163 12864.205531 0.370851 4
fig = plt.figure(figsize=(7,6))
plt.plot(res[0]['param'],res[0]['BIC'],'.')
plt.xlabel('number of regressors')
plt.ylabel('BIC')
plt.title('Based on the first 400 parameter combinations\nout of all 32767 combinations')
fig.tight_layout()
print(res[1].summary())
OLS Regression Results
==============================================================================
Dep. Variable: m2_price R-squared: 0.389
Model: OLS Adj. R-squared: 0.384
Method: Least Squares F-statistic: 76.53
Date: pon, 01 sty 2018 Prob (F-statistic): 6.69e-74
Time: 11:38:13 Log-Likelihood: -6407.1
No. Observations: 729 AIC: 1.283e+04
Df Residuals: 722 BIC: 1.286e+04
Df Model: 6
Covariance Type: nonrobust
=======================================================================================================
coef std err t P>|t| [0.025 0.975]
-------------------------------------------------------------------------------------------------------
Intercept 8490.5444 172.069 49.344 0.000 8152.729 8828.359
poly(surface, 3)[0] 5503.4610 1703.883 3.230 0.001 2158.304 8848.618
poly(surface, 3)[1] -5272.0708 1639.676 -3.215 0.001 -8491.172 -2052.969
poly(surface, 3)[2] -4233.0860 1616.893 -2.618 0.009 -7407.459 -1058.713
construction1990:starszy1990[False] -41.5891 4.940 -8.419 0.000 -51.288 -31.891
construction1990:starszy1990[True] 174.0078 12.844 13.548 0.000 148.792 199.224
odleglosc -269.2994 20.151 -13.364 0.000 -308.861 -229.738
==============================================================================
Omnibus: 268.584 Durbin-Watson: 1.955
Prob(Omnibus): 0.000 Jarque-Bera (JB): 1786.943
Skew: 1.498 Prob(JB): 0.00
Kurtosis: 10.061 Cond. No. 678.
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
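The construction1990:starszy1990[False] and [True] rows above arise because the boolean enters the model only through the interaction, so patsy codes both of its levels, giving a separate construction1990 slope for flats built before and after 1990 (when the main effect is also in the formula, as in the model further below, patsy switches to a single [T.True] contrast). A minimal illustration on a made-up toy frame:
import pandas as pd
from patsy import dmatrix
toy = pd.DataFrame({'x': [1.0, 2.0, 3.0, 4.0],
                    'g': [False, False, True, True]})
# columns: Intercept, x:g[False], x:g[True]
print(dmatrix('x:g', data=toy, return_type='dataframe'))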
def sym_model(data=X, var_dep='m2_price', z=z, crit='AIC', sym=100):
    # enumerate all non-empty subsets of the candidate terms in z
    N = len(z)
    comb0 = [np.array(list(itertools.combinations(z, i))) for i in range(1, N + 1)]
    comb = list(itertools.chain.from_iterable(comb0))
    lc = len(comb)
    wariant = [comb[i].tolist() for i in range(lc)]
    formula = [var_dep + '~' + '+'.join(wariant[i]) for i in range(lc)]
    # draw sym formulas at random (without replacement) instead of scanning a prefix
    sym_formula = np.random.choice(formula, size=sym, replace=False)
    lsym = len(sym_formula)
    mod = [smf.ols(f, data=data).fit() for f in sym_formula]
    AIC = [mod[i].aic for i in range(lsym)]
    BIC = [mod[i].bic for i in range(lsym)]
    R2adj = [mod[i].rsquared_adj for i in range(lsym)]
    param = [len(mod[i].params) - 1 for i in range(lsym)]
    d = pd.DataFrame()
    d['AIC'] = AIC
    d['BIC'] = BIC
    d['R2adj'] = R2adj
    d['param'] = param
    # rank the scored models by the chosen criterion and return the best fit
    if crit == "AIC":
        ranking = d.sort_values(by=['AIC'])
    elif crit == "BIC":
        ranking = d.sort_values(by=['BIC'])
    elif crit == "R2adj":
        ranking = d.sort_values(by=['R2adj'], ascending=False)
    return [ranking, mod[ranking.index[0]]]
np.random.seed(2305)
sym = sym_model(data=X, var_dep='m2_price', z=z, crit="BIC", sym=400)
print(sym[0].head(3))
AIC BIC R2adj param
228 12849.361005 12872.319374 0.363809 4
313 12829.385681 12879.894092 0.386023 11
25 12742.480087 12884.821972 0.469337 31
fig = plt.figure(figsize=(7,6))
plt.plot(sym[0]['param'],sym[0]['BIC'],'.')
plt.xlabel('number of regressors')
plt.ylabel('BIC')
plt.title('Based on 400 randomly chosen parameter combinations\nout of all 32767 combinations')
fig.tight_layout()
print(sym[1].summary())
OLS Regression Results
==============================================================================
Dep. Variable: m2_price R-squared: 0.367
Model: OLS Adj. R-squared: 0.364
Method: Least Squares F-statistic: 105.1
Date: pon, 01 sty 2018 Prob (F-statistic): 1.44e-70
Time: 11:38:25 Log-Likelihood: -6419.7
No. Observations: 729 AIC: 1.285e+04
Df Residuals: 724 BIC: 1.287e+04
Df Model: 4
Covariance Type: nonrobust
========================================================================================================
coef std err t P>|t| [0.025 0.975]
--------------------------------------------------------------------------------------------------------
Intercept 8246.9563 216.311 38.125 0.000 7822.285 8671.628
n_rooms 117.1125 64.914 1.804 0.072 -10.330 244.555
construction1990 -43.1573 4.969 -8.686 0.000 -52.912 -33.402
construction1990:starszy1990[T.True] 224.8637 15.946 14.102 0.000 193.558 256.170
odleglosc -284.9458 20.166 -14.130 0.000 -324.536 -245.356
==============================================================================
Omnibus: 252.375 Durbin-Watson: 1.907
Prob(Omnibus): 0.000 Jarque-Bera (JB): 1476.336
Skew: 1.436 Prob(JB): 0.00
Kurtosis: 9.353 Cond. No. 90.3
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
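For this seed the random sample of 400 formulas fares slightly worse than scanning the first 400 subsets: the best BIC found is 12872.3 versus 12860.4 above. A one-line comparison, assuming both result objects are still in scope:
print(res[0]['BIC'].min(), sym[0]['BIC'].min())   # 12860.36 vs 12872.32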
Chapter 2.6.8
from rpy2.robjects import r, pandas2ri
pandas2ri.activate()
r('library(PBImisc)')
r.data('Drosophila')
Drosophila = r['Drosophila']
bs = pandas2ri.ri2py_dataframe(Drosophila[1])
print(bs.head())
ewg w rp v sd run gl pgk cg gpdh ... rox ald mlc jan \
BS01-1 0 0 0 0 0 0 1 0 0 0 ... 1 1 1 1
BS01-2 2 2 2 2 2 2 1 1 1 1 ... 0 0 0 0
BS01-3 0 0 0 0 0 0 1 1 1 1 ... 1 1 1 1
BS01-4 2 2 2 2 0 0 0 0 0 0 ... 0 0 0 0
BS01-5 0 0 0 0 0 0 0 0 0 0 ... 1 1 1 1
efi pc1 adjpc1 area areat tibia
BS01-1 1 0.00984 0.10939 0.00787 0.01887 0.417
BS01-2 1 0.01268 0.14151 0.00816 0.01860 0.439
BS01-3 1 0.01033 0.12961 0.00805 0.01851 0.435
BS01-4 0 0.00417 0.13079 0.00639 0.01449 0.441
BS01-5 1 0.01848 0.16576 0.00977 0.02388 0.409
[5 rows x 46 columns]
korr = bs.iloc[:,:41].corr()
print(korr.iloc[:6,:6])
ewg w rp v sd run
ewg 1.000000 0.925348 0.708987 0.561678 0.304831 0.160065
w 0.925348 1.000000 0.763598 0.626499 0.350672 0.196015
rp 0.708987 0.763598 1.000000 0.818519 0.448198 0.233764
v 0.561678 0.626499 0.818519 1.000000 0.615473 0.367563
sd 0.304831 0.350672 0.448198 0.615473 1.000000 0.679004
run 0.160065 0.196015 0.233764 0.367563 0.679004 1.000000
fig = plt.figure(figsize=(10,8))
ax = fig.add_subplot(1,1,1)
sns.heatmap(1-abs(korr),
xticklabels=korr.columns,yticklabels=korr.columns,ax=ax)
fig.tight_layout()
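The heatmap shows 1 - |corr|, so with seaborn's default colormap the darkest cells mark the most strongly correlated marker pairs, e.g. ewg and w at 0.93 above. A short sketch listing pairs beyond an arbitrary 0.9 threshold:
pairs = [(a, b, round(korr.loc[a, b], 3))
         for i, a in enumerate(korr.columns)
         for b in korr.columns[i + 1:]
         if abs(korr.loc[a, b]) > 0.9]
print(pairs[:5])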
from patsy import dmatrices
from statsmodels.stats.outliers_influence import variance_inflation_factor
dat = bs.iloc[:,:42]
var_dep = 'pc1'
variable_ind = dat.iloc[:,:41].columns
formula = var_dep + '~' + '+'.join(variable_ind)
y, X = dmatrices(formula, data=dat, return_type="dataframe")
VIF = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif = pd.DataFrame()
vif["VIF"] = VIF
vif.index = X.columns
print(vif)
VIF
Intercept 10.956593
ewg 7.690664
w 9.137029
rp 4.800058
v 4.202430
sd 2.917888
run 2.141868
gl 4.794329
pgk 6.284281
cg 10.253837
gpdh 12.402848
ninaC 9.290877
glt 6.062582
mhc 6.698641
ddc 10.295207
duc 4.900250
sli 1.695382
egfr 7.369367
twi 10.980767
zip 4.962627
lsp 6.144002
ve 8.744777
acr 8.272664
dbi 8.539726
h 4.253744
fz 8.014706
eip 11.055961
tra 6.976073
rdg 9.046203
ht 11.227116
ant 10.836373
ninaE 7.225617
fas 4.983311
mst 4.810534
odh 10.007537
tub 9.854711
hb 5.529557
rox 4.549265
ald 8.740273
mlc 8.656464
jan 11.685838
efi 9.664005
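A common (if arbitrary) rule of thumb treats VIF values above 10 as a sign of problematic collinearity; by that standard gpdh, eip, ht, jan and a few other markers stand out above. A short filter, ignoring the intercept row (its VIF is not meaningful here):
high_vif = vif.drop('Intercept')
print(high_vif[high_vif['VIF'] > 10])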
Chapter 2.6.9
import statsmodels.formula.api as smf
import statsmodels.stats.api as sms
nam = bs.iloc[:,:41].columns
formula = ['pc1~' + v for v in nam]
a1 = [ smf.ols(formula[i],data=bs).fit() for i in range(len(formula)) ]
pval = [ sms.anova_lm(a1[i], typ=2)['PR(>F)'][0] for i in range(len(formula)) ]
Df = pd.DataFrame()
Df['pval'] = -np.log(pval)
Df['chrom'] = list(Drosophila[2])
Df['pos'] = list(Drosophila[3])
g = sns.factorplot(x="pos", y="pval",col="chrom",data=Df, kind="point")
g.fig.tight_layout()
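Since pval holds -ln(p), a natural reference line is the Bonferroni-corrected threshold -ln(0.05/41) for the 41 markers tested; points above it remain significant after correction. A sketch adding the line to each panel, assuming g is still in scope:
thr = -np.log(0.05 / 41)      # about 6.71 on the -ln(p) scale used above
for ax in g.axes.flat:        # one panel per chromosome
    ax.axhline(thr, ls='--', color='gray')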