# Comenzamos por cargar las librerias que ocuparemos
import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols
import matplotlib.pyplot as plt


df = pd.read_table("Datos.txt")
df.head()


# Si el individuo ha recibido el tratamiento C, entonces asignaremos el valor 
# de 1, caso contrario asignaremos el valor de 0
def trat_C(tratamiento):
    if tratamiento == "C":
        return 1
    else: 
        return 0


# Agregamos una columna nueva y aplicamos la funcion anterior
# a cada una de las filas de la columna (serie) TRT
df["xC"] = df["TRT"].apply(lambda x: trat_C(x))
df.head()


df = df.rename(columns={'age':'x1',
                        "TRT" : "tratamiento",
                        "x2" : "xA",
                        "x3" : "xB"})
df.head()


# Algunas caracteristicas sobre la columna tratamiento
df["tratamiento"].describe()

count     36
unique     3
top        A
freq      12
Name: tratamiento, dtype: object


# Veamos dichas caracteristicas para cada una de las columnas
for i in df.columns:
    print(df[i].describe(), end = "\n", sep = "|")

count    36.000000
mean     55.166667
std      12.415428
min      25.000000
25%      46.750000
50%      58.000000
75%      63.250000
max      73.000000
Name: y, dtype: float64
count    36.000000
mean     44.111111
std      14.628305
min      19.000000
25%      32.250000
50%      44.000000
75%      56.500000
max      67.000000
Name: x1, dtype: float64
count    36.000000
mean      0.333333
std       0.478091
min       0.000000
25%       0.000000
50%       0.000000
75%       1.000000
max       1.000000
Name: xA, dtype: float64
count    36.000000
mean      0.333333
std       0.478091
min       0.000000
25%       0.000000
50%       0.000000
75%       1.000000
max       1.000000
Name: xB, dtype: float64
count     36
unique     3
top        A
freq      12
Name: tratamiento, dtype: object
count    36.000000
mean      0.333333
std       0.478091
min       0.000000
25%       0.000000
50%       0.000000
75%       1.000000
max       1.000000
Name: xC, dtype: float64


# Datos agrupados por efectividad

# Consideraremos solo las columnas del tratamiento
# y la efectividad
grupos = df[["tratamiento", "y"]]
# Agrupamos por tratamiento y observamos algunas caracteristicas
# por tratamiento y efectividad
grupos.groupby('tratamiento').describe()


# Varianza de la efectividad por tratamiento
grupos.groupby('tratamiento').var()


import seaborn as sns
import matplotlib.pyplot as plt

# Configuramos un estilo de graficacion
plt.style.use('seaborn')

# Configuramos el tamagnio de salida de nuestros graficos
plt.rcParams['figure.figsize'] = [14, 6]

# Mostraremos dos graficos
fig, axs = plt.subplots(ncols=2)

sns.scatterplot(data=df, x="y", y="x1", hue=df.tratamiento.tolist(), ax=axs[0])
sns.boxplot(data=df, x="tratamiento", y="y", hue=df.tratamiento.tolist(), ax=axs[1])
plt.show()


# Para realizar el ajuste del modelo utilizamos la funcion ols() como sigue
#              ---  modelo ---                          -- Ajuste --   
model = ols('y ~ x1 + xB + xC + x1:xB + x1:xC', data=df).fit()

# Veamos la informacion de la tabla anova obtenida de nuestro 
# ajuste del modelo anterior
sm.stats.anova_lm(model, typ=2)


model.params

Intercept    47.515591
x1            0.330507
xB          -18.597385
xC          -41.304210
x1:xB         0.193177
x1:xC         0.702884
dtype: float64


# Podemos ver mas informacion sobre nuestro ajuste del modelo 
print(model.summary())

                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       0.914
Model:                            OLS   Adj. R-squared:                  0.900
Method:                 Least Squares   F-statistic:                     64.04
Date:                Tue, 20 Sep 2022   Prob (F-statistic):           4.26e-15
Time:                        21:22:29   Log-Likelihood:                -97.024
No. Observations:                  36   AIC:                             206.0
Df Residuals:                      30   BIC:                             215.5
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     47.5156      3.825     12.422      0.000      39.703      55.328
x1             0.3305      0.081      4.056      0.000       0.164       0.497
xB           -18.5974      5.416     -3.434      0.002     -29.658      -7.537
xC           -41.3042      5.085     -8.124      0.000     -51.688     -30.920
x1:xB          0.1932      0.117      1.657      0.108      -0.045       0.431
x1:xC          0.7029      0.109      6.451      0.000       0.480       0.925
==============================================================================
Omnibus:                        2.593   Durbin-Watson:                   1.633
Prob(Omnibus):                  0.273   Jarque-Bera (JB):                1.475
Skew:                          -0.183   Prob(JB):                        0.478
Kurtosis:                       2.079   Cond. No.                         577.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.


# Rango de graficacion
x = range(10, 80)

# Promedio de la eficacia del tratamiento considerando
# el tratamiento A
def tA(x):
    return 47.51 + 0.33 * x

# Promedio de la eficacia del tratamiento considerando
# el tratamiento B
def tB(x):
    return 28.91 + 0.53 * x

# Promedio de la eficacia del tratamiento considerando
# el tratamiento C
def tC(x):
    return 6.21 + 1.03 * x


# Consideramos tres dataframes para separar a los individuos
# de cada tratamiento
df1 = df[df["tratamiento"] == "A"]
df2 = df[df["tratamiento"] == "B"]
df3 = df[df["tratamiento"] == "C"]


# Graficamos el ajuste de las rectas para cada uno de los tratamientos
plt.plot(x, [tA(i) for i in x])
plt.scatter(df1.x1, df1.y)

plt.plot(x, [tB(i) for i in x])
plt.scatter(df2.x1, df2.y)

plt.plot(x, [tC(i) for i in x])
plt.scatter(df3.x1, df3.y)

plt.legend(['Modelo Tratamiento A', 'Tratamiento A', 
            "Modelo Tratamiento B", 'Tratamiento B',
            "Modelo Tratamiento C", 'Tratamiento C'])
plt.xlabel("Edad")
plt.ylabel("Efectividad")

Text(0, 0.5, 'Efectividad')


f = model.f_pvalue
f

4.264070157904808e-15


A = np.identity(len(model.params))
A = A[1:, :]
A

array([[0., 1., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 1.]])


model.f_test(A)

<class 'statsmodels.stats.contrast.ContrastResults'>
<F test: F=64.04253685100939, p=4.264070157905692e-15, df_denom=30, df_num=5>


A = [[0,0,0,0,1,0], [0,0,0,0,0,1]]

model.f_test(A)

<class 'statsmodels.stats.contrast.ContrastResults'>
<F test: F=22.83123711361978, p=9.410457463646548e-07, df_denom=30, df_num=2>


import rpy2 
print(rpy2.__version__)

3.5.4


# Mandamos a llamar la estructura de vector en R
# para que podemos utilizarla
from rpy2.robjects import FloatVector

# Importamos una funcion para poder utilizar librerias de R
from rpy2.robjects.packages import importr
# Objetos de R
from rpy2 import robjects

# Consideramos (importamos) las librerias de R que ocuparemos
stats = importr('stats')
base = importr('base')
multcomp = importr('multcomp')

Unable to determine R home: [WinError 2] El sistema no puede encontrar el archivo especificado


# Convertimos cada una de nuestras variables a un objeto vector de R.
y = FloatVector(df.y)
x1 = FloatVector(df.x1)
xB = FloatVector(df.xB)
xC = FloatVector(df.xC)


# Variables de Python asignadas a variables de R (R objetos) 
# dentro de un ambiente global.
# (robjects.globalenv["y"] <- y)
robjects.globalenv["y"] = y
robjects.globalenv["x1"] = x1
robjects.globalenv["xB"] = xB
robjects.globalenv["xC"] = xC


# Realizamos el ajuste del modelo utilizando la libreria stats de R
# como si fuera una libreria de Python
Ajuste = stats.lm(formula = "y ~ x1 + xB + xC + x1:xB + x1:xC")


# Summary del ajuste
print(base.summary(Ajuste))

Call:
(function (formula, data, subset, weights, na.action, method = "qr", 
    model = TRUE, x = FALSE, y = FALSE, qr = TRUE, singular.ok = TRUE, 
    contrasts = NULL, offset, ...) 
{
    ret.x <- x
    ret.y <- y
    cl <- match.call()
    mf <- match.call(expand.dots = FALSE)
    m <- match(c("formula", "data", "subset", "weights", "na.action", 
        "offset"), names(mf), 0L)
    mf <- mf[c(1L, m)]
    mf$drop.unused.levels <- TRUE
    mf[[1L]] <- quote(stats::model.frame)
    mf <- eval(mf, parent.frame())
    if (method == "model.frame") 
        return(mf)
    else if (method != "qr") 
        warning(gettextf("method = '%s' is not supported. Using 'qr'", 
            method), domain = NA)
    mt <- attr(mf, "terms")
    y <- model.response(mf, "numeric")
    w <- as.vector(model.weights(mf))
    if (!is.null(w) && !is.numeric(w)) 
        stop("'weights' must be a numeric vector")
    offset <- model.offset(mf)
    mlm <- is.matrix(y)
    ny <- if (mlm) 
        nrow(y)
    else length(y)
    if (!is.null(offset)) {
        if (!mlm) 
            offset <- as.vector(offset)
        if (NROW(offset) != ny) 
            stop(gettextf("number of offsets is %d, should equal %d (number of observations)", 
                NROW(offset), ny), domain = NA)
    }
    if (is.empty.model(mt)) {
        x <- NULL
        z <- list(coefficients = if (mlm) matrix(NA_real_, 0, 
            ncol(y)) else numeric(), residuals = y, fitted.values = 0 * 
            y, weights = w, rank = 0L, df.residual = if (!is.null(w)) sum(w != 
            0) else ny)
        if (!is.null(offset)) {
            z$fitted.values <- offset
            z$residuals <- y - offset
        }
    }
    else {
        x <- model.matrix(mt, mf, contrasts)
        z <- if (is.null(w)) 
            lm.fit(x, y, offset = offset, singular.ok = singular.ok, 
                ...)
        else lm.wfit(x, y, w, offset = offset, singular.ok = singular.ok, 
            ...)
    }
    class(z) <- c(if (mlm) "mlm", "lm")
    z$na.action <- attr(mf, "na.action")
    z$offset <- offset
    z$contrasts <- attr(x, "contrasts")
    z$xlevels <- .getXlevels(mt, mf)
    z$call <- cl
    z$terms <- mt
    if (model) 
        z$model <- mf
    if (ret.x) 
        z$x <- x
    if (ret.y) 
        z$y <- y
    if (!qr) 
        z$qr <- NULL
    z
})(formula = "y ~ x1 + xB + xC + x1:xB + x1:xC")

Residuals:
    Min      1Q  Median      3Q     Max 
-6.4366 -2.7637  0.1887  2.9075  6.5634 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)  47.51559    3.82523  12.422 2.34e-13 ***
x1            0.33051    0.08149   4.056 0.000328 ***
xB          -18.59739    5.41573  -3.434 0.001759 ** 
xC          -41.30421    5.08453  -8.124 4.56e-09 ***
x1:xB         0.19318    0.11660   1.657 0.108001    
x1:xC         0.70288    0.10896   6.451 3.98e-07 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 3.925 on 30 degrees of freedom
Multiple R-squared:  0.9143,	Adjusted R-squared:  0.9001 
F-statistic: 64.04 on 5 and 30 DF,  p-value: 4.264e-15


# Objeto que "vive" en Python pero es de R
K = robjects.FloatVector([0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,-1])
# Objeto de R
K = robjects.r['matrix'](K, ncol=6, nrow=3)
print(K)

# Objeto que "vive" en Python pero es de R
m = robjects.FloatVector([0,0,0])
# Objeto de R
m = robjects.r['matrix'](m, ncol=1)
print(m)

     [,1] [,2] [,3] [,4] [,5] [,6]
[1,]    0    0    0    0    1    0
[2,]    0    0    0    0    0    1
[3,]    0    0    0    0    1   -1

     [,1]
[1,]    0
[2,]    0
[3,]    0


# Realizamos la prueba lineal general para pruebas simultáneas
print(base.summary(multcomp.glht(Ajuste, linfct=K, rhs=m)))

# base: R base

	 Simultaneous Tests for General Linear Hypotheses

Fit: (function (formula, data, subset, weights, na.action, method = "qr", 
    model = TRUE, x = FALSE, y = FALSE, qr = TRUE, singular.ok = TRUE, 
    contrasts = NULL, offset, ...) 
{
    ret.x <- x
    ret.y <- y
    cl <- match.call()
    mf <- match.call(expand.dots = FALSE)
    m <- match(c("formula", "data", "subset", "weights", "na.action", 
        "offset"), names(mf), 0L)
    mf <- mf[c(1L, m)]
    mf$drop.unused.levels <- TRUE
    mf[[1L]] <- quote(stats::model.frame)
    mf <- eval(mf, parent.frame())
    if (method == "model.frame") 
        return(mf)
    else if (method != "qr") 
        warning(gettextf("method = '%s' is not supported. Using 'qr'", 
            method), domain = NA)
    mt <- attr(mf, "terms")
    y <- model.response(mf, "numeric")
    w <- as.vector(model.weights(mf))
    if (!is.null(w) && !is.numeric(w)) 
        stop("'weights' must be a numeric vector")
    offset <- model.offset(mf)
    mlm <- is.matrix(y)
    ny <- if (mlm) 
        nrow(y)
    else length(y)
    if (!is.null(offset)) {
        if (!mlm) 
            offset <- as.vector(offset)
        if (NROW(offset) != ny) 
            stop(gettextf("number of offsets is %d, should equal %d (number of observations)", 
                NROW(offset), ny), domain = NA)
    }
    if (is.empty.model(mt)) {
        x <- NULL
        z <- list(coefficients = if (mlm) matrix(NA_real_, 0, 
            ncol(y)) else numeric(), residuals = y, fitted.values = 0 * 
            y, weights = w, rank = 0L, df.residual = if (!is.null(w)) sum(w != 
            0) else ny)
        if (!is.null(offset)) {
            z$fitted.values <- offset
            z$residuals <- y - offset
        }
    }
    else {
        x <- model.matrix(mt, mf, contrasts)
        z <- if (is.null(w)) 
            lm.fit(x, y, offset = offset, singular.ok = singular.ok, 
                ...)
        else lm.wfit(x, y, w, offset = offset, singular.ok = singular.ok, 
            ...)
    }
    class(z) <- c(if (mlm) "mlm", "lm")
    z$na.action <- attr(mf, "na.action")
    z$offset <- offset
    z$contrasts <- attr(x, "contrasts")
    z$xlevels <- .getXlevels(mt, mf)
    z$call <- cl
    z$terms <- mt
    if (model) 
        z$model <- mf
    if (ret.x) 
        z$x <- x
    if (ret.y) 
        z$y <- y
    if (!qr) 
        z$qr <- NULL
    z
})(formula = "y ~ x1 + xB + xC + x1:xB + x1:xC")

Linear Hypotheses:
       Estimate Std. Error t value Pr(>|t|)    
1 == 0   0.1932     0.1166   1.657    0.238    
2 == 0   0.7029     0.1090   6.451   <0.001 ***
3 == 0  -0.5097     0.1104  -4.617   <0.001 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Adjusted p values reported -- single-step method)


# Ajuste del modelo reducido
model_r = ols('y ~ x1 + xB + xC + x1:xC', data=df).fit()

sm.stats.anova_lm(model, typ=2)


# Parametros obtenidos del ajuste del modelo
model_r.params

Intercept    43.285243
x1            0.424864
xB          -10.027208
xC          -37.073862
x1:xC         0.608526
dtype: float64


 print(model_r.summary())

                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       0.907
Model:                            OLS   Adj. R-squared:                  0.894
Method:                 Least Squares   F-statistic:                     75.14
Date:                Tue, 27 Sep 2022   Prob (F-statistic):           1.68e-15
Time:                        10:00:30   Log-Likelihood:                -98.600
No. Observations:                  36   AIC:                             207.2
Df Residuals:                      31   BIC:                             215.1
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     43.2852      2.927     14.787      0.000      37.315      49.255
x1             0.4249      0.060      7.093      0.000       0.303       0.547
xB           -10.0272      1.648     -6.085      0.000     -13.388      -6.667
xC           -37.0739      4.519     -8.204      0.000     -46.290     -27.858
x1:xC          0.6085      0.095      6.374      0.000       0.414       0.803
==============================================================================
Omnibus:                        2.617   Durbin-Watson:                   1.804
Prob(Omnibus):                  0.270   Jarque-Bera (JB):                1.661
Skew:                          -0.284   Prob(JB):                        0.436
Kurtosis:                       2.113   Cond. No.                         369.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.


plt.rcParams['figure.figsize'] = [14, 10]
# Graficos: variable independiente vs variables dependientes
fig = sm.graphics.plot_partregress_grid(model_r)
fig.tight_layout(pad=1.0)

eval_env: 1
eval_env: 1
eval_env: 1
eval_env: 1
eval_env: 1


plt.rcParams['figure.figsize'] = [12, 6]
sns.histplot(model_r.resid, bins = 10)
plt.show()


from scipy import stats as st
mu, std = st.norm.fit(model_r.resid)
mu, std

(6.17777434146976e-14, 3.743252307963197)


fig, ax = plt.subplots()

# Grafico de los residuales:
# histograma junto con la densidad
sns.histplot(x=model_r.resid, ax=ax, stat="density", linewidth=0, kde=True)
ax.set(title="Distribución de los residuales", xlabel="Residuales")

# Grafico correspondiente a una densidad proveniente de una distribucion
# normal (mu, std^2)
xmin, xmax = plt.xlim() # valores maximos para el intervalo de graficacion del eje X
x = np.linspace(xmin, xmax, 100) # generamos algunos valores para la x
p = st.norm.pdf(x, mu, std) # Funcion de densidad obtenida de una normal (mu, std^2)
sns.lineplot(x=x, y=p, color="red", ax=ax) # Graficamos la densidad anterior
plt.show()


sm.qqplot(model_r.resid, line='s');


# Shapiro-Wilk test
# ==============================================================================
shapiro_test = st.shapiro(model_r.resid)
shapiro_test

ShapiroResult(statistic=0.9656770825386047, pvalue=0.319449245929718)


# D'Agostino's K-squared test
# ==============================================================================
k2, p_value = st.normaltest(model_r.resid)
print(f"Estadístico = {k2}, p-value = {p_value}")

Estadístico = 2.6169126708165154, p-value = 0.27023688969450765


sm.graphics.plot_fit(model_r, 1, vlines=False);


import statsmodels.stats.diagnostic as sms
name = ['Lagrange multiplier statistic', 'p-value', 
        'f-value', 'f p-value']
test = sms.het_breuschpagan(model.resid, model.model.exog)
result = zip(name, test)
print(list(result))

[('Lagrange multiplier statistic', 2.9958191453619496), ('p-value', 0.7006304411123819), ('f-value', 0.5446253900782909), ('f p-value', 0.740988751661612)]


A = [[0,0,1, 0,0],
    [0,0,0, 1,0],
    [0,0,0, 0,1],
    [0,0,1,-1,0]]
A
model_r.f_test(A)

C:\Users\fjza9\anaconda3\lib\site-packages\statsmodels\base\model.py:1871: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 4, but rank is 3
  warnings.warn('covariance of constraints does not have full '

<class 'statsmodels.stats.contrast.ContrastResults'>
<F test: F=30.034106752500577, p=2.663200219438571e-09, df_denom=31, df_num=3>


A = [[0,0,0,0,1]]
A
model_r.f_test(A)

<class 'statsmodels.stats.contrast.ContrastResults'>
<F test: F=40.63101538056917, p=4.2498150170287766e-07, df_denom=31, df_num=1>

	y
	count	mean	std	min	25%	50%	75%	max
tratamiento
A	12.0	62.333333	6.386539	52.0	57.50	62.5	66.00	73.0
B	12.0	51.916667	8.360713	40.0	45.75	51.5	58.50	64.0
C	12.0	51.250000	17.189452	25.0	35.50	54.5	64.25	71.0

	y
tratamiento
A	40.787879
B	69.901515
C	295.477273

	sum_sq	df	F	PR(>F)
x1	3302.258622	1.0	214.363829	3.331105e-15
xB	602.599365	1.0	39.117320	6.862837e-07
xC	620.533401	1.0	40.281495	5.311571e-07
x1:xB	42.282024	1.0	2.744708	1.080006e-01
x1:xC	641.064054	1.0	41.614229	3.982473e-07
Residual	462.147738	30.0	NaN	NaN

	sum_sq	df	F	PR(>F)
x1	3302.258622	1.0	214.363829	3.331105e-15
xB	602.599365	1.0	39.117320	6.862837e-07
xC	620.533401	1.0	40.281495	5.311571e-07
x1:xB	42.282024	1.0	2.744708	1.080006e-01
x1:xC	641.064054	1.0	41.614229	3.982473e-07
Residual	462.147738	30.0	NaN	NaN

ANCOVA¶

Contenido¶

Supuestos de ANCOVA Enlace permanente¶

Implementación ¶

Datos¶

Variables¶

Análisis exploratorio ¶

ANCOVA ¶

Ajuste del modelo en python¶

Reducción del Modelo ¶

Modelo reducido ¶

Verificación de supuestos ¶

Algunas preguntas¶

Ejercicio¶

	y	age	x2	x3	TRT
0	56	21	1	0	A
1	41	23	0	1	B
2	40	30	0	1	B
3	28	19	0	0	C
4	55	28	1	0	A

	y	age	x2	x3	TRT	xC
0	56	21	1	0	A	0
1	41	23	0	1	B	0
2	40	30	0	1	B	0
3	28	19	0	0	C	1
4	55	28	1	0	A	0