Mathematics for Biology Students II¶
Summer Semester 2024
25.06.2024
© 2024 Prof. Dr. Rüdiger W. Braun
In [1]:
import numpy as np
import pandas as pd
from scipy import stats
import statsmodels.formula.api as smf
import seaborn as sns
sns.set_theme()
import warnings
warnings.filterwarnings('ignore', message='The figure layout has changed')
Linear Models¶
Today's Topics¶
- multiple explanatory variables
- transformations
- normality assumptions
- categorical explanatory variables
Multiple Explanatory Variables¶
In [2]:
df = pd.read_csv('larven.csv')
df.head()
Out[2]:
A | B | C | D | E | Anzahl_Larven | |
---|---|---|---|---|---|---|
0 | 2441.8 | 301.7 | 3588.2 | 520976.8 | 5161.9 | 286 |
1 | 2925.3 | 259.2 | 4531.4 | 481366.6 | 8707.2 | 296 |
2 | 2639.7 | 265.3 | 3495.7 | 529228.9 | 4353.0 | 249 |
3 | 2264.7 | 293.4 | 3921.1 | 511927.5 | 6284.5 | 288 |
4 | 2798.7 | 344.4 | 4351.0 | 477372.6 | 3609.7 | 305 |
- Anzahl_Larven: number of larvae of a microorganism per litre
- A, B, C, D, E: concentrations of five potential pollutants in ppb (parts per billion)
- the experiment is observational
- there is no way to vary individual parameters independently
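Because the predictors cannot be varied independently, it can be worth checking how strongly they are correlated with one another before fitting. A minimal sketch on synthetic stand-in data (the column names mirror larven.csv, the values are made up):

```python
import numpy as np
import pandas as pd

# Synthetic stand-in for the predictor columns of larven.csv (values made up)
rng = np.random.default_rng(0)
predictors = pd.DataFrame(rng.normal(size=(30, 5)), columns=list("ABCDE"))

# Pairwise correlations among the explanatory variables;
# off-diagonal entries close to +1 or -1 would warn of multicollinearity
corr = predictors.corr().round(2)
print(corr)
```

With observational data, strong pairwise correlations are one simple warning sign for the "condition number is large" note that appears in the summaries below.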
In [3]:
df.describe().round(0)
Out[3]:
A | B | C | D | E | Anzahl_Larven | |
---|---|---|---|---|---|---|
count | 30.0 | 30.0 | 30.0 | 30.0 | 30.0 | 30.0 |
mean | 3055.0 | 300.0 | 4108.0 | 508931.0 | 5842.0 | 265.0 |
std | 457.0 | 48.0 | 438.0 | 45879.0 | 3799.0 | 28.0 |
min | 2265.0 | 194.0 | 3306.0 | 415617.0 | 452.0 | 196.0 |
25% | 2763.0 | 266.0 | 3831.0 | 478371.0 | 3259.0 | 248.0 |
50% | 2988.0 | 302.0 | 4073.0 | 498393.0 | 4328.0 | 264.0 |
75% | 3319.0 | 325.0 | 4295.0 | 538847.0 | 8281.0 | 280.0 |
max | 3967.0 | 401.0 | 5176.0 | 619914.0 | 18242.0 | 347.0 |
In [4]:
formel = 'Anzahl_Larven ~ A + B + C + D + E'
modell = smf.ols(formel, df)
In [5]:
res = modell.fit()
In [6]:
res.summary()
Out[6]:
Dep. Variable: | Anzahl_Larven | R-squared: | 0.904 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.885 |
Method: | Least Squares | F-statistic: | 45.43 |
Date: | Mon, 24 Jun 2024 | Prob (F-statistic): | 1.83e-11 |
Time: | 21:06:26 | Log-Likelihood: | -106.74 |
No. Observations: | 30 | AIC: | 225.5 |
Df Residuals: | 24 | BIC: | 233.9 |
Df Model: | 5 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | 397.2447 | 30.017 | 13.234 | 0.000 | 335.293 | 459.197 |
A | -0.0436 | 0.004 | -11.010 | 0.000 | -0.052 | -0.035 |
B | -0.0504 | 0.038 | -1.324 | 0.198 | -0.129 | 0.028 |
C | 0.0348 | 0.004 | 8.286 | 0.000 | 0.026 | 0.043 |
D | -0.0002 | 3.93e-05 | -6.316 | 0.000 | -0.000 | -0.000 |
E | -1.621e-05 | 0.000 | -0.034 | 0.973 | -0.001 | 0.001 |
Omnibus: | 1.288 | Durbin-Watson: | 1.484 |
---|---|---|---|
Prob(Omnibus): | 0.525 | Jarque-Bera (JB): | 0.989 |
Skew: | 0.166 | Prob(JB): | 0.610 |
Kurtosis: | 2.174 | Cond. No. | 8.85e+06 |
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 8.85e+06. This might indicate that there are
strong multicollinearity or other numerical problems.
- The influence of B and E is not significant
- We remove them from the model one at a time
- The least significant influence is that of E
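The "drop the least significant variable" step can be read directly off the fitted model's p-values. A minimal sketch on synthetic data (the real larven.csv is not assumed here; the variable B is constructed to have no effect):

```python
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

# Synthetic data: y depends on A and C, but not on B
rng = np.random.default_rng(1)
d = pd.DataFrame(rng.normal(size=(30, 3)), columns=["A", "B", "C"])
d["y"] = 2.0 * d.A - 1.5 * d.C + rng.normal(scale=0.1, size=30)

res = smf.ols("y ~ A + B + C", d).fit()

# p-values of all terms except the intercept
pvals = res.pvalues.drop("Intercept")
# the least significant term is the one with the largest p-value
weakest = pvals.idxmax()
print(weakest)
```

This mirrors backward elimination: refit without the weakest term, then check the remaining p-values again.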
In [7]:
formel3 = 'Anzahl_Larven ~ A + B + C + D'
modell3 = smf.ols(formel3, df)
res3 = modell3.fit()
In [8]:
res3.summary()
Out[8]:
Dep. Variable: | Anzahl_Larven | R-squared: | 0.904 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.889 |
Method: | Least Squares | F-statistic: | 59.15 |
Date: | Mon, 24 Jun 2024 | Prob (F-statistic): | 2.21e-12 |
Time: | 21:06:26 | Log-Likelihood: | -106.74 |
No. Observations: | 30 | AIC: | 223.5 |
Df Residuals: | 25 | BIC: | 230.5 |
Df Model: | 4 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | 397.0929 | 29.084 | 13.653 | 0.000 | 337.194 | 456.992 |
A | -0.0436 | 0.004 | -11.319 | 0.000 | -0.052 | -0.036 |
B | -0.0506 | 0.037 | -1.373 | 0.182 | -0.127 | 0.025 |
C | 0.0348 | 0.004 | 8.516 | 0.000 | 0.026 | 0.043 |
D | -0.0002 | 3.85e-05 | -6.446 | 0.000 | -0.000 | -0.000 |
Omnibus: | 1.259 | Durbin-Watson: | 1.488 |
---|---|---|---|
Prob(Omnibus): | 0.533 | Jarque-Bera (JB): | 0.980 |
Skew: | 0.168 | Prob(JB): | 0.613 |
Kurtosis: | 2.180 | Cond. No. | 8.75e+06 |
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 8.75e+06. This might indicate that there are
strong multicollinearity or other numerical problems.
- the influence of B is still not significant
In [9]:
formel4 = 'Anzahl_Larven ~ A + C + D'
modell4 = smf.ols(formel4, df)
res4 = modell4.fit()
In [10]:
res4.summary()
Out[10]:
Dep. Variable: | Anzahl_Larven | R-squared: | 0.897 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.885 |
Method: | Least Squares | F-statistic: | 75.66 |
Date: | Mon, 24 Jun 2024 | Prob (F-statistic): | 5.68e-13 |
Time: | 21:06:26 | Log-Likelihood: | -107.83 |
No. Observations: | 30 | AIC: | 223.7 |
Df Residuals: | 26 | BIC: | 229.3 |
Df Model: | 3 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | 388.4026 | 28.867 | 13.455 | 0.000 | 329.066 | 447.739 |
A | -0.0438 | 0.004 | -11.175 | 0.000 | -0.052 | -0.036 |
C | 0.0342 | 0.004 | 8.282 | 0.000 | 0.026 | 0.043 |
D | -0.0003 | 3.87e-05 | -6.597 | 0.000 | -0.000 | -0.000 |
Omnibus: | 1.857 | Durbin-Watson: | 1.519 |
---|---|---|---|
Prob(Omnibus): | 0.395 | Jarque-Bera (JB): | 1.200 |
Skew: | 0.188 | Prob(JB): | 0.549 |
Kurtosis: | 2.095 | Cond. No. | 8.54e+06 |
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 8.54e+06. This might indicate that there are
strong multicollinearity or other numerical problems.
- the three remaining substances have a significant influence on the larva population
- A and D decrease the count: pollutants
- C increases it: a nutrient
Transformations¶
- The confidence interval for the coefficient of substance D is reported as [-0., -0.]
- Solution: we express the concentration of D in ppm (parts per million) instead of ppb
- This multiplies the coefficient by 1000
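That rescaling a predictor rescales its coefficient by the inverse factor can be checked directly. A minimal sketch using plain NumPy least squares on made-up data:

```python
import numpy as np

# Made-up data: response depends linearly on a concentration given in ppb
rng = np.random.default_rng(2)
x_ppb = rng.uniform(4e5, 6e5, size=30)
y = 300.0 - 0.0002 * x_ppb + rng.normal(size=30)

# Fit y = a + b*x by least squares, once in ppb and once in ppm
X_ppb = np.column_stack([np.ones_like(x_ppb), x_ppb])
X_ppm = np.column_stack([np.ones_like(x_ppb), x_ppb / 1000])
b_ppb = np.linalg.lstsq(X_ppb, y, rcond=None)[0][1]
b_ppm = np.linalg.lstsq(X_ppm, y, rcond=None)[0][1]

print(b_ppm / b_ppb)  # the ratio is 1000: same model, rescaled coefficient
```

The fit itself is unchanged; only the units of the coefficient change, which is why R-squared, F-statistic, and p-values are identical in the two summaries.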
In [11]:
df['D_in_ppm'] = df.D / 1000
In [12]:
formel5 = 'Anzahl_Larven ~ A + C + D_in_ppm'
modell5 = smf.ols(formel5, df)
res5 = modell5.fit()
In [13]:
res5.summary()
Out[13]:
Dep. Variable: | Anzahl_Larven | R-squared: | 0.897 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.885 |
Method: | Least Squares | F-statistic: | 75.66 |
Date: | Mon, 24 Jun 2024 | Prob (F-statistic): | 5.68e-13 |
Time: | 21:06:26 | Log-Likelihood: | -107.83 |
No. Observations: | 30 | AIC: | 223.7 |
Df Residuals: | 26 | BIC: | 229.3 |
Df Model: | 3 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | 388.4026 | 28.867 | 13.455 | 0.000 | 329.066 | 447.739 |
A | -0.0438 | 0.004 | -11.175 | 0.000 | -0.052 | -0.036 |
C | 0.0342 | 0.004 | 8.282 | 0.000 | 0.026 | 0.043 |
D_in_ppm | -0.2555 | 0.039 | -6.597 | 0.000 | -0.335 | -0.176 |
Omnibus: | 1.857 | Durbin-Watson: | 1.519 |
---|---|---|---|
Prob(Omnibus): | 0.395 | Jarque-Bera (JB): | 1.200 |
Skew: | 0.188 | Prob(JB): | 0.549 |
Kurtosis: | 2.095 | Cond. No. | 8.63e+04 |
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 8.63e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
Transformation¶
- we transformed one column of the table from ppb to ppm
- this did not change the statistics, but it made the result easier to interpret
- individual rows cannot be transformed; all rows must be measured in the same system of units
- transformations are also used when data fail to meet the model's assumptions
Example: the Galápagos Islands¶
- data from Faraway: Linear Models with Python
- original data source:
- Johnson, M., and Raven, P.: Species number and endemism: the Galápagos Archipelago revisited. Science 179 (1973), 893-895
In [14]:
df = pd.read_csv('galapagos.csv')
df.head()
Out[14]:
Unnamed: 0 | Species | Area | Elevation | Nearest | Scruz | Adjacent | |
---|---|---|---|---|---|---|---|
0 | Baltra | 58 | 25.09 | 346 | 0.6 | 0.6 | 1.84 |
1 | Bartolome | 31 | 1.24 | 109 | 0.6 | 26.3 | 572.33 |
2 | Caldwell | 3 | 0.21 | 114 | 2.8 | 58.7 | 0.78 |
3 | Champion | 25 | 0.10 | 46 | 1.9 | 47.4 | 0.18 |
4 | Coamano | 2 | 0.05 | 77 | 1.9 | 1.9 | 903.82 |
- Species: number of different vertebrate species
- Area: area of the island
- Elevation: highest elevation on the island
- Nearest: distance to the nearest island
- Scruz: distance to Santa Cruz
- Adjacent: area of the nearest neighbouring island
In [15]:
df.describe()
Out[15]:
Species | Area | Elevation | Nearest | Scruz | Adjacent | |
---|---|---|---|---|---|---|
count | 30.000000 | 30.000000 | 30.000000 | 30.000000 | 30.000000 | 30.000000 |
mean | 85.233333 | 261.708667 | 368.033333 | 10.060000 | 56.976667 | 261.098333 |
std | 114.633053 | 864.110519 | 421.604937 | 14.274636 | 68.032334 | 864.518967 |
min | 2.000000 | 0.010000 | 25.000000 | 0.200000 | 0.000000 | 0.030000 |
25% | 13.000000 | 0.257500 | 97.750000 | 0.800000 | 11.025000 | 0.520000 |
50% | 42.000000 | 2.590000 | 192.000000 | 3.050000 | 46.650000 | 2.590000 |
75% | 96.000000 | 59.237500 | 435.250000 | 10.025000 | 81.075000 | 59.237500 |
max | 444.000000 | 4669.320000 | 1707.000000 | 47.400000 | 290.200000 | 4669.320000 |
In [16]:
formel = 'Species ~ Area + Elevation + Nearest + Scruz + Adjacent'
modell = smf.ols(formel, df)
res = modell.fit()
In [17]:
res.summary()
Out[17]:
Dep. Variable: | Species | R-squared: | 0.766 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.717 |
Method: | Least Squares | F-statistic: | 15.70 |
Date: | Mon, 24 Jun 2024 | Prob (F-statistic): | 6.84e-07 |
Time: | 21:06:26 | Log-Likelihood: | -162.54 |
No. Observations: | 30 | AIC: | 337.1 |
Df Residuals: | 24 | BIC: | 345.5 |
Df Model: | 5 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | 7.0682 | 19.154 | 0.369 | 0.715 | -32.464 | 46.601 |
Area | -0.0239 | 0.022 | -1.068 | 0.296 | -0.070 | 0.022 |
Elevation | 0.3195 | 0.054 | 5.953 | 0.000 | 0.209 | 0.430 |
Nearest | 0.0091 | 1.054 | 0.009 | 0.993 | -2.166 | 2.185 |
Scruz | -0.2405 | 0.215 | -1.117 | 0.275 | -0.685 | 0.204 |
Adjacent | -0.0748 | 0.018 | -4.226 | 0.000 | -0.111 | -0.038 |
Omnibus: | 12.683 | Durbin-Watson: | 2.476 |
---|---|---|---|
Prob(Omnibus): | 0.002 | Jarque-Bera (JB): | 13.498 |
Skew: | 1.136 | Prob(JB): | 0.00117 |
Kurtosis: | 5.374 | Cond. No. | 1.90e+03 |
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.9e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
Only two of the explanatory variables have a significant influence:
- the elevation of the island
- the area of the neighbouring island, with a negative correlation ❓❓❓
Normality Assumptions¶
- smf.ols has assumptions that must be met for it to apply
- one of them is that all variables are normally distributed
- we check this with Q-Q plots, as in Lecture 14
In [18]:
import statsmodels.api as sm
pp_s = sm.ProbPlot(df.Species)
pp_s.qqplot();
- a transformation is necessary
- the logarithm is one possible choice
- this is also what Johnson and Raven do
In [19]:
pp_lw = sm.ProbPlot(np.log(df.Species))
pp_lw.qqplot();
I show all Q-Q plots in one figure, using a technique that is not part of the course material
In [20]:
from matplotlib import pyplot as plt
fig = plt.figure()
for j in range(6):
spalte = df.columns[j+1]
ax = fig.add_subplot(231+j)
pp = sm.ProbPlot(df[spalte])
pp.qqplot(ax=ax, xlabel=spalte, ylabel='')
fig.subplots_adjust(wspace=0.5, hspace=0.4);
- So all variables violate the normality assumption more or less clearly
- We therefore transform them as well
- Problem: Scruz has distance 0 from itself
- $\log$ is only defined for positive numbers
- We remove this column from the model
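Dropping the column is the simplest fix. Another common workaround, not used in this lecture, is $\log(1+x)$, which is defined at 0. A small sketch using a few of the Scruz distances from the table above:

```python
import numpy as np

scruz = np.array([0.0, 0.6, 26.3, 58.7])  # first entry: Santa Cruz itself

# log(0) is not finite; NumPy returns -inf (and warns about division)
with np.errstate(divide="ignore"):
    log_vals = np.log(scruz)
print(log_vals[0])  # -inf

# log1p(x) = log(1 + x) stays finite and maps 0 to 0
print(np.log1p(scruz))
```

Whether $\log(1+x)$ is appropriate depends on the units of $x$, which is one reason simply removing the column is a defensible choice here.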
In [21]:
fig = plt.figure()
for j in range(6):
if j != 4:
spalte = df.columns[j+1]
ax = fig.add_subplot(231+j)
pp = sm.ProbPlot(np.log(df[spalte]))
pp.qqplot(ax=ax, xlabel=spalte, ylabel='')
fig.subplots_adjust(wspace=0.5, hspace=0.4);
In [22]:
transformierte_formel = '''np.log(Species) ~ np.log(Area) + np.log(Elevation)
+ np.log(Nearest) + np.log(Adjacent)'''
modell2 = smf.ols(transformierte_formel, df)
The triple quotes `'''` delimit a string (German: "Zeichenkette") that spans multiple lines
In [23]:
res2 = modell2.fit()
res2.summary()
Out[23]:
Dep. Variable: | np.log(Species) | R-squared: | 0.783 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.749 |
Method: | Least Squares | F-statistic: | 22.59 |
Date: | Mon, 24 Jun 2024 | Prob (F-statistic): | 5.38e-08 |
Time: | 21:06:27 | Log-Likelihood: | -32.526 |
No. Observations: | 30 | AIC: | 75.05 |
Df Residuals: | 25 | BIC: | 82.06 |
Df Model: | 4 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | 4.7049 | 1.575 | 2.987 | 0.006 | 1.461 | 7.948 |
np.log(Area) | 0.4892 | 0.098 | 5.014 | 0.000 | 0.288 | 0.690 |
np.log(Elevation) | -0.3277 | 0.316 | -1.036 | 0.310 | -0.979 | 0.324 |
np.log(Nearest) | -0.1239 | 0.093 | -1.339 | 0.193 | -0.315 | 0.067 |
np.log(Adjacent) | -0.0284 | 0.046 | -0.623 | 0.539 | -0.122 | 0.065 |
Omnibus: | 0.558 | Durbin-Watson: | 2.355 |
---|---|---|---|
Prob(Omnibus): | 0.757 | Jarque-Bera (JB): | 0.643 |
Skew: | -0.120 | Prob(JB): | 0.725 |
Kurtosis: | 2.324 | Cond. No. | 73.3 |
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Only the area still has a significant influence
Categorical Data¶
In [24]:
df = pd.read_csv('kinder.csv')
df.head()
Out[24]:
family | father | mother | midparentHeight | children | childNum | gender | childHeight | |
---|---|---|---|---|---|---|---|---|
0 | 001 | 78.5 | 67.0 | 75.43 | 4 | 1 | male | 73.2 |
1 | 001 | 78.5 | 67.0 | 75.43 | 4 | 2 | female | 69.2 |
2 | 001 | 78.5 | 67.0 | 75.43 | 4 | 3 | female | 69.0 |
3 | 001 | 78.5 | 67.0 | 75.43 | 4 | 4 | female | 69.0 |
4 | 002 | 75.5 | 66.5 | 73.66 | 4 | 1 | male | 73.5 |
In [25]:
sns.lmplot(df, x='father', y='childHeight', hue='gender');
In [26]:
formel = 'childHeight ~ father + mother + gender'
modell = smf.ols(formel, df)
In [27]:
res = modell.fit()
In [28]:
res.summary()
Out[28]:
Dep. Variable: | childHeight | R-squared: | 0.635 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.634 |
Method: | Least Squares | F-statistic: | 540.3 |
Date: | Mon, 24 Jun 2024 | Prob (F-statistic): | 3.38e-203 |
Time: | 21:06:28 | Log-Likelihood: | -2044.6 |
No. Observations: | 934 | AIC: | 4097. |
Df Residuals: | 930 | BIC: | 4117. |
Df Model: | 3 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | 16.5212 | 2.727 | 6.058 | 0.000 | 11.169 | 21.873 |
gender[T.male] | 5.2150 | 0.142 | 36.775 | 0.000 | 4.937 | 5.493 |
father | 0.3928 | 0.029 | 13.699 | 0.000 | 0.337 | 0.449 |
mother | 0.3176 | 0.031 | 10.245 | 0.000 | 0.257 | 0.378 |
Omnibus: | 11.156 | Durbin-Watson: | 1.549 |
---|---|---|---|
Prob(Omnibus): | 0.004 | Jarque-Bera (JB): | 15.397 |
Skew: | -0.114 | Prob(JB): | 0.000453 |
Kurtosis: | 3.586 | Cond. No. | 3.63e+03 |
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 3.63e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
A case distinction is encoded here:
$$ \text{childHeight} = 16.5212 + 0.3928 \cdot \text{father} + 0.3176 \cdot \text{mother} + \begin{cases} 0.0 & \text{for a girl,} \\ 5.215 & \text{for a boy.} \end{cases} $$
The terminology apparently comes from pharmacology:
- female is the default here
- everything that deviates from the default is a treatment
- which level is the default and which is the treatment is decided by the program
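In statsmodels the choice is made by the patsy formula machinery: by default the alphabetically first level becomes the reference, and the reference can be overridden with `Treatment`. A minimal sketch on a tiny made-up table (not the Galton data used above):

```python
import pandas as pd
import statsmodels.formula.api as smf

# Tiny illustrative data set (values made up)
d = pd.DataFrame({
    "height": [69.0, 64.0, 71.0, 63.5, 70.0, 65.0],
    "gender": ["male", "female", "male", "female", "male", "female"],
})

# Default: the alphabetically first level ('female') is the reference,
# so the summary shows a coefficient for gender[T.male]
res1 = smf.ols("height ~ gender", d).fit()
print(res1.params.index.tolist())

# The reference level can be chosen explicitly via Treatment:
res2 = smf.ols('height ~ C(gender, Treatment(reference="male"))', d).fit()
print(res2.params.index.tolist())
```

With `reference="male"`, the dummy variable switches to the female level, and the intercept then refers to boys instead of girls.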