6 - The Haunted DAG & The Causal Terror

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from videpy import Vide

import networkx as nx

import stan
import nest_asyncio

plt.style.use('default')

plt.rcParams['axes.facecolor'] = 'lightgray'

# For drawing DAGs
import daft
from causalgraphicalmodels import CausalGraphicalModel
# For running Stan inside a Jupyter notebook
nest_asyncio.apply()

R Code 6.1 - Page 162

np.random.seed(1914)

N = 200
p = 0.1

# Newsworthiness and trustworthiness are simulated as uncorrelated
nw = np.random.normal(0, 1, N)
tw = np.random.normal(0, 1, N)

# Select the top 10% by total score
s = nw + tw  # Total score
q = np.quantile(s, 1-p)  # Top 10% threshold
selected = s >= q  # Boolean mask

print('Newsworthiness: \n\n', nw[selected], '\n\n')
print('Trustworthiness:\n\n', tw[selected], '\n\n')

# Caution: np.correlate returns the (un-normalized) inner product of the two vectors,
# not the Pearson correlation that the book's cor() reports.
print('Inner product: ', np.correlate(tw[selected], nw[selected]))
Newsworthiness: 

 [ 0.82456357  1.85614543  0.85556981  1.4066898   1.6026727   1.42256068
  0.71166144  1.30222516  0.56454812  3.02039213  0.93879118  1.04202561
 -0.4640972   0.12272254  1.99579665  1.59807671  1.82853378  1.15394068
  1.48966066  1.22640255] 


Trustworthiness:

 [ 1.24510334  1.29709756  2.10293057  0.66088341  1.76728533  0.64448519
  1.17598088  0.35356094  1.08805257 -0.72327249  1.31462133  1.54253157
  2.8128998   1.51862223  1.18048237  2.31948695 -0.09763283  0.87205345
  0.74979249  0.46329186] 


Inner product:  [19.93737242]
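For the Pearson correlation itself (what the book's cor() reports), np.corrcoef can be used; a minimal sketch:

# Correlation of the selected points: selecting on the sum nw + tw induces a
# negative correlation even though nw and tw were simulated independently
print('Correlation: ', np.corrcoef(tw[selected], nw[selected])[0, 1])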
plt.figure(figsize=(17, 6))

plt.scatter(tw, nw, s=6, color='gray')
plt.scatter(tw[selected], nw[selected], s=7, color='blue')

plt.title('Figure 6.1 - Page 162')
plt.xlabel('Trustworthiness')
plt.ylabel('Newsworthiness')

plt.grid(ls='--', color='white', alpha=0.3)
_images/parte_6_5_0.png

R Code 6.2 - Page 163

N = 100

np.random.seed(909)  # Try other seeds as well

height = np.random.normal(10, 2, N)

leg_proportion = np.random.uniform(0.4, 0.5, N)

# Each leg is a proportion of height plus a little measurement noise
leg_left  = leg_proportion * height + np.random.normal(0, 0.02, N)
leg_right = leg_proportion * height + np.random.normal(0, 0.02, N)


df = pd.DataFrame({'height': height, 
                   'leg_left': leg_left, 
                   'leg_right': leg_right})
df.head()
height leg_left leg_right
0 8.463728 4.094675 4.078446
1 9.854070 4.776475 4.687749
2 8.668694 4.192607 4.256472
3 7.523768 3.088674 3.088206
4 9.381352 4.093217 4.048181
plt.figure(figsize=(17, 6))

plt.scatter(leg_right, leg_left, s=5, alpha=0.3)

plt.title('Leg data')
plt.xlabel('Leg Right')
plt.ylabel('Leg Left')

plt.grid(ls='--', color='white', alpha=0.4)

plt.show()
_images/parte_6_8_0.png
plt.figure(figsize=(17, 6))
plt.hist(df.height, rwidth=0.9, density=True)
plt.grid(ls='--', color='white', alpha=0.4)
plt.show()

plt.figure(figsize=(17, 6))
plt.hist(df.leg_left, rwidth=0.9, density=True, alpha=0.5)
plt.hist(df.leg_right, rwidth=0.9, density=True, alpha=0.5)

plt.grid(ls='--', color='white', alpha=0.4)
plt.show()
_images/parte_6_9_0.png _images/parte_6_9_1.png
model = """
    data {
        int<lower=0> N;
        vector[N] height;
        vector[N] leg_left;
        vector[N] leg_right;
    }
    
    parameters {
        real alpha;
        real beta_left;
        real beta_right;
        real<lower=0> sigma; 
    }
    
    model {
        alpha ~ normal(10, 100);
        beta_left ~ normal(2, 10);
        beta_right ~ normal(2, 10);
        sigma ~ exponential(1);
        
        height ~ normal(alpha + beta_left * leg_left + beta_right * leg_right, sigma);
    }
"""

data = {
    'N': N,
    'height': height,
    'leg_left': leg_left,
    'leg_right': leg_right
}

posteriori = stan.build(model, data=data)
samples = posteriori.sample(num_chains=4, num_samples=1000)

alpha = samples['alpha'].flatten()
beta_left = samples['beta_left'].flatten()
beta_right = samples['beta_right'].flatten()
sigma = samples['sigma'].flatten()

R Code 6.4 - Page 164

Vide.summary(samples)
mean std 7.0% 93.0%
alpha 0.94 0.33 0.29 1.48
beta_left -1.38 2.08 -5.23 2.34
beta_right 3.38 2.06 -0.21 7.31
sigma 0.66 0.05 0.58 0.74
Vide.plot_forest(samples, title='Leg right & Leg left')
_images/parte_6_13_0.png

R Code 6.5 - Page 164

plt.figure(figsize=(17, 6))

plt.scatter(beta_right, beta_left, s=5, alpha=0.3)

plt.title('Posterior of the leg betas')
plt.xlabel('Beta Right')
plt.ylabel('Beta Left')

plt.grid(ls='--', color='white', alpha=0.4)

plt.show()
_images/parte_6_15_0.png

R Code 6.6 - Page 165

plt.figure(figsize=(17, 6))

plt.hist((beta_left + beta_right), density=True, alpha=0.8, bins=100, rwidth=0.9)

plt.title('Posterior')
plt.xlabel('Sum of beta_left and beta_right')
plt.ylabel('Density')

plt.grid(ls='--', color='white', alpha=0.4)

plt.show()
_images/parte_6_17_0.png
# Comparison with the individual betas
plt.figure(figsize=(17, 6))

plt.hist(beta_left, density=True, alpha=0.8, bins=100, rwidth=0.9)  # Beta left

plt.hist(beta_right, density=True, alpha=0.8, bins=100, rwidth=0.9)  # Beta Right

plt.title('Comparison of the posteriors of beta_left and beta_right')
plt.xlabel('Betas')
plt.ylabel('Density')

plt.grid(ls='--', color='white', alpha=0.1)

plt.show()
_images/parte_6_18_0.png

R Code 6.7 - Page 166

model = """
    data {
        int N;
        vector[N] leg_left;
        vector[N] height;
    }
    
    parameters {
        real alpha;
        real beta_left;
        real<lower=0> sigma;  // must be positive for the exponential prior
    }
    
    model {
        alpha ~ normal(10, 100);
        beta_left ~ normal(2, 10);
        sigma ~ exponential(1);
        
        height ~ normal(alpha + beta_left * leg_left, sigma);
    }
"""

data = {
    'N': len(height),
    'leg_left': leg_left,
    'height': height,
}

posteriori = stan.build(model, data=data)
samples = posteriori.sample(num_chains=4, num_samples=1000)

alpha = samples['alpha'].flatten()
beta_left = samples['beta_left'].flatten()
sigma = samples['sigma'].flatten()
# R Code 6.7 - continued
Vide.summary(samples)
mean std 7.0% 93.0%
alpha 0.83 0.33 0.28 1.43
beta_left 2.02 0.07 1.90 2.14
sigma 0.67 0.05 0.58 0.75
Vide.plot_forest(samples, title='Leg Left')
_images/parte_6_22_0.png

R Code 6.8

df = pd.read_csv('data/milk.csv', sep=';')

df_std = df[['kcal.per.g', 'perc.fat', 'perc.lactose']].copy()

# Standardize each column (zero mean, unit standard deviation)
for col in df_std.columns:
    df_std[col] = (df_std[col] - df_std[col].mean()) / df_std[col].std()

df_std.head()
kcal.per.g perc.fat perc.lactose
0 -0.940041 -1.217243 1.307262
1 -0.816126 -1.030355 1.011285
2 -1.125913 -1.391531 1.382679
3 -1.001998 -1.335535 1.586874
4 -0.258511 -0.469693 0.257115
# There are no missing values

df_std.isna().sum()
kcal.per.g      0
perc.fat        0
perc.lactose    0
dtype: int64

R Code 6.9 - Page 167

# kcal.per.g regressed on perc.fat

model_kf = """
    data {
        int N;
        vector[N] outcome;
        vector[N] predictor;
    }
    
    parameters {
        real alpha;
        real beta;
        real<lower=0> sigma;
    }
    
    model {
        alpha ~ normal(0, 0.2);
        beta ~ normal(0, 0.5);
        sigma ~ exponential(1);
        
        outcome ~ normal(alpha + beta * predictor, sigma);
    }
"""

data_kf = {
    'N': len(df_std['kcal.per.g']),
    'outcome': list(df_std['kcal.per.g'].values),
    'predictor': list(df_std['perc.fat'].values),
}

posteriori_kf = stan.build(model_kf, data=data_kf)
samples_kf = posteriori_kf.sample(num_chains=4, num_samples=1000)
# kcal.per.g regressed on perc.lactose

model_kl = """
    data {
        int N;
        vector[N] outcome;
        vector[N] predictor;
    }
    
    parameters {
        real alpha;
        real beta;
        real<lower=0> sigma;
    }
    
    model {
        alpha ~ normal(0, 0.2);
        beta ~ normal(0, 0.5);
        sigma ~ exponential(1);
        
        outcome ~ normal(alpha + beta * predictor, sigma);
    }
"""

data_kl = {
    'N': len(df_std['kcal.per.g']),
    'outcome': df_std['kcal.per.g'].values,
    'predictor': df_std['perc.lactose'].values,
}

posteriori_kl = stan.build(model_kl, data=data_kl)
samples_kl = posteriori_kl.sample(num_chains=4, num_samples=1000)
Vide.summary(samples_kf)
mean std 7.0% 93.0%
alpha 0.00 0.08 -0.14 0.16
beta 0.86 0.09 0.68 1.02
sigma 0.49 0.07 0.37 0.62
Vide.plot_forest(samples_kf, title='perc.fat')
_images/parte_6_30_0.png
Vide.summary(samples_kl)
mean std 7.0% 93.0%
alpha 0.00 0.07 -0.12 0.14
beta -0.90 0.08 -1.05 -0.77
sigma 0.41 0.06 0.31 0.52
Vide.plot_forest(samples_kl, title='perc.lactose')
_images/parte_6_32_0.png

R Code 6.10 - Page 167

model = """
    data {
        int N;
        vector[N] F;  // Fat
        vector[N] L;  // Lactose
        vector[N] K;  // kcal/g
    }
    
    parameters {
        real alpha;
        real bF;
        real bL;
        real<lower=0> sigma;  // must be positive for the exponential prior
    }
    
    model {
        alpha ~ normal(0, 0.2);
        bF ~ normal(0, 0.5);
        bL ~ normal(0, 0.5);
        sigma ~ exponential(1);
        
        K ~ normal(alpha + bF*F + bL*L, sigma);
    }
"""

data = {
    'N': len(df_std['kcal.per.g']),
    'F': df_std['perc.fat'].values,
    'L': df_std['perc.lactose'].values,
    'K': df_std['kcal.per.g'].values,
}

posteriori_FL = stan.build(model, data=data)
samples_FL = posteriori_FL.sample(num_chains=4, num_samples=1000)
Vide.summary(samples_FL)
mean std 7.0% 93.0%
alpha -0.00 0.07 -0.13 0.12
bF 0.25 0.19 -0.11 0.58
bL -0.66 0.19 -1.00 -0.31
sigma 0.41 0.06 0.31 0.51
Vide.plot_forest(samples_FL, title='Perc.Fat & Perc.Lactose')
_images/parte_6_36_0.png

R Code 6.11 - Page 168 - Figure 6.3

pd.plotting.scatter_matrix(df_std, diagonal='hist', grid=True, figsize=(17, 6))
plt.show()
_images/parte_6_38_0.png

R Code 6.12 - Overthinking (to revisit)

model = """
    data {
        int N;
        vector[N] kcal_per_g;
        vector[N] perc_fat;
        vector[N] new_predictor_X;
    }
    
    parameters {
        real alpha;
        real bF;
        real bX;
        real<lower=0> sigma;
    }
    
    model {
        // No explicit priors: Stan falls back on flat (improper) priors here,
        // mirroring the book's use of lm() in this Overthinking box.
        kcal_per_g ~ normal(alpha + bF * perc_fat + bX * new_predictor_X, sigma);
    }
"""
def generate_predictor_x(r=0.9):
    N = len(df['perc.fat'].values)
    
    # Construct X so that corr(X, perc.fat) is approximately r
    mean = r * df['perc.fat'].values
    sd = np.sqrt((1 - r**2) * np.var(df['perc.fat'].values))
    
    return np.random.normal(mean, sd, N)  # New predictor X
def generate_data_dict(r=0.9):
    data = {
        'N': len(df['kcal.per.g']),
        'kcal_per_g': df['kcal.per.g'].values,
        'perc_fat': df['perc.fat'].values,
        'new_predictor_X': generate_predictor_x(r=r),
    }
    return data
def adjust_model(r=0.9):
    
    parameter_std_samples = []
    
    for _ in range(1):  # In the book this is repeated 100x
        # Run the model
        posteriori = stan.build(model, data=generate_data_dict(r=r))
        samples = posteriori.sample(num_chains=4, num_samples=1000)
        
        # Posterior standard deviation of the slope bF (the quantity plotted below)
        parameter_std_samples.append(samples['bF'].flatten().std())
            
    return parameter_std_samples
stddev = []
r_sequence = np.arange(0, 0.99, 0.1)  # In book using 0.01

for r in r_sequence:
    parameter = adjust_model(r=r)
    stddev.append(np.mean(parameter))
plt.figure(figsize=(17, 6))

plt.plot(r_sequence, stddev)
plt.xlabel("correlation", fontsize=14)
plt.ylabel("stddev", fontsize=14)
plt.show()
_images/parte_6_45_0.png

R Code 6.13

np.random.seed(3)

# Number of plants
N = 100

# Initial heights
h0 = np.random.normal(10, 2, N)

# Assign treatments and simulate fungus
treatment = np.repeat([0,1], repeats=int(N/2))
fungus = np.random.binomial(n=1, p=(0.5 - treatment*0.4), size=N)
h1 = h0 + np.random.normal(5 - 3*fungus, 1, N)

# Dataframe
d = pd.DataFrame.from_dict({'h0': h0, 
                            'h1': h1, 
                            'treatment': treatment, 
                            'fungus': fungus})
d.describe().T
count mean std min 25% 50% 75% max
h0 100.0 9.782726 2.138707 4.168524 8.279903 9.642072 11.353860 14.316299
h1 100.0 14.209396 2.766929 6.881795 12.512818 14.190085 15.688766 20.786965
treatment 100.0 0.500000 0.502519 0.000000 0.000000 0.500000 1.000000 1.000000
fungus 100.0 0.270000 0.446196 0.000000 0.000000 0.000000 1.000000 1.000000

R Code 6.14

sim_p = np.random.lognormal(0, 0.25, int(1e4))
pd.DataFrame(sim_p, columns=['sim_p']).describe().T
count mean std min 25% 50% 75% max
sim_p 10000.0 1.02366 0.259148 0.391611 0.83898 0.993265 1.174575 2.781105

R Code 6.15

Model:

\[ h_{1,i} \sim Normal(\mu_i, \sigma) \]
\[ \mu_i = h_{0, i} \times p \]

Priors:

\[ p \sim LogNormal(0, 0.25) \]
\[ \sigma \sim Exponential(1) \]
model = """
    data {
        int N; 
        vector[N] h1;
        vector[N] h0;
    }
    
    parameters {
        real<lower=0> p;
        real<lower=0> sigma;
        
    }
    
    model {
        vector[N] mu;
        mu = h0 * p;
        
        h1 ~ normal(mu, sigma);
        
        // Priors
        p ~ lognormal(0, 0.25);
        sigma ~ exponential(1); 
    }
"""


data = {
    'N': N,
    'h1': h1,
    'h0': h0,
}

posteriori = stan.build(model, data=data)
samples = posteriori.sample(num_chains=4, num_samples=1000)
Vide.summary(samples)
mean std 7.0% 93.0%
p 1.43 0.02 1.40 1.46
sigma 1.99 0.15 1.72 2.25

R Code 6.16

Post-treatment bias model:

\[ h_{1, i} \sim Normal(\mu_i, \sigma) \]
\[ \mu_i = h_{0, i} \times p_i \]
\[ p_i = \alpha + \beta_T T_i + \beta_F F_i \]

Priors:

\[ \alpha \sim LogNormal(0, 0.25) \]
\[ \beta_T \sim Normal(0, 0.5) \]
\[ \beta_F \sim Normal(0, 0.5) \]
\[ \sigma \sim Exponential(1) \]
"""
On the mu definition below
--------------------------

vector[N] a;
vector[N] b;
vector[N] c; 

This operation:
c = a .* b;

is the same as:
for (n in 1:N) {
  c[n] = a[n] * b[n];
}

Reference:
https://mc-stan.org/docs/reference-manual/arithmetic-expressions.html
"""

model = """
    data {
        int N;
        vector[N] h0;
        vector[N] h1;
        vector[N] T;  // Treatment
        vector[N] F;  // Fungus
    }

    parameters {
        real<lower=0> alpha;  // the lognormal prior requires alpha > 0
        real bT;
        real bF;
        real<lower=0> sigma;
    }

    model {
        vector[N] mu;
        vector[N] p;
        
        p = alpha + bT * T + bF * F;
        mu = h0 .* p;  
    
        // likelihood
        h1 ~ normal(mu, sigma);
    
        // priors
        alpha ~ lognormal(0, 0.25);
        bT ~ normal(0, 0.5);
        bF ~ normal(0, 0.5);
        sigma ~ exponential(1);
    }
"""

data = {
    'N': N,
    'h0': h0,
    'h1': h1,
    'T': treatment,
    'F': fungus,
}

posteriori = stan.build(model, data=data)
samples = posteriori.sample(num_chains=4, num_samples=1000)
Vide.summary(samples)
mean std 7.0% 93.0%
alpha 1.53 0.03 1.48 1.59
bT -0.03 0.04 -0.09 0.03
bF -0.32 0.04 -0.39 -0.25
sigma 1.44 0.11 1.26 1.64
Vide.plot_forest(samples, title='Treatment and Fungus')
_images/parte_6_58_0.png

R Code 6.17

model = """
    data {
        int N;
        vector[N] h0;
        vector[N] h1;
        vector[N] T;  // Treatment
    }

    parameters {
        real<lower=0> alpha;
        real bT;
        real<lower=0> sigma;
    }
    
    model {
        vector[N] mu;
        vector[N] p;

        p = alpha + bT * T;
        mu = h0 .* p;

        h1 ~ normal(mu, sigma);

        alpha ~ lognormal(0, 0.2);
        bT ~ normal(0, 0.5);
        sigma ~ exponential(1);
    }
"""

data = {
    'N': N,
    'h0': h0,
    'h1': h1,
    'T': treatment,
}

posteriori = stan.build(model, data=data)
samples = posteriori.sample(num_chains=4, num_samples=1000)
Vide.summary(samples)
mean std 7.0% 93.0%
alpha 1.36 0.03 1.31 1.41
bT 0.13 0.04 0.06 0.20
sigma 1.89 0.14 1.65 2.14
Vide.plot_forest(samples, title='Treatment without fungus')
_images/parte_6_62_0.png

R Code 6.18

G = nx.DiGraph()

nodes = {0: '$H_0$', 
         1: '$H_1$', 
         2: 'F', 
         3: 'T'}

for i in nodes:
    G.add_node(nodes[i])

edges = [(nodes[0], nodes[1]),
         (nodes[2], nodes[1]),
         (nodes[3], nodes[2])]

G.add_edges_from(edges)

# explicitly set positions
pos = {nodes[0]: (0, 0), 
       nodes[1]: (1, 0), 
       nodes[2]: (1.5, 0), 
       nodes[3]: (2, 0)}

options = {
    "font_size": 15,
    "node_size": 400,
    "node_color": "white",
    "edgecolors": "white",
    "linewidths": 1,
    "width": 1,
}

nx.draw(G, pos, with_labels=True, **options)

# Set margins for the axes so that nodes aren't clipped
ax = plt.gca()
# ax.margins(0.01)
plt.axis("off")
plt.show()
_images/parte_6_64_0.png

R Code 6.19

\[ F \perp\!\!\!\perp H_0 \]
\[ H_0 \perp\!\!\!\perp T \]
\[ H_1 \perp\!\!\!\perp T \mid F \]
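These implied conditional independencies can also be derived programmatically; a minimal sketch using the same causalgraphicalmodels API as R Code 6.31 below:

plant_dag = CausalGraphicalModel(
    nodes=['H0', 'H1', 'F', 'T'],
    edges=[('H0', 'H1'), ('F', 'H1'), ('T', 'F')],
)

# Each tuple (X, Y, Z) reads as X _||_ Y | Z
for ind in plant_dag.get_all_independence_relationships():
    print(ind)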

R Code 6.20

# np.random.seed(3)

# Number of plants
N = 100

# Initial heights
h0 = np.random.normal(10, 2, N)

# Assign treatments and simulate fungus, now with moisture M
treatment = np.repeat([0, 1], repeats=int(N/2))

M = np.random.binomial(n=1, p=0.5, size=N)  # Moisture -> Bernoulli(p=0.5)

fungus = np.random.binomial(n=1, p=(0.5 - treatment * 0.4 + 0.4 * M), size=N)
h1 = h0 + np.random.normal((5 + 3 * M), 1, N)

# Dataframe
d2 = pd.DataFrame.from_dict({'h0': h0, 
                            'h1': h1, 
                            'treatment': treatment, 
                            'fungus': fungus})
d2.describe().T 
count mean std min 25% 50% 75% max
h0 100.0 10.091610 1.976358 6.313914 8.850678 10.169278 11.369749 15.273222
h1 100.0 16.510702 2.715874 10.105379 14.711143 15.899131 18.671828 22.662508
treatment 100.0 0.500000 0.502519 0.000000 0.000000 0.500000 1.000000 1.000000
fungus 100.0 0.480000 0.502117 0.000000 0.000000 0.000000 1.000000 1.000000
# R Code 6.17 rerun with the new data (d2)

model = """
    data {
        int N;
        vector[N] h0;
        vector[N] h1;
        vector[N] T;  // Treatment
    }

    parameters {
        real<lower=0> alpha;
        real bT;
        real<lower=0> sigma;
    }
    
    model {
        vector[N] mu;
        vector[N] p;

        p = alpha + bT * T;
        mu = h0 .* p;

        h1 ~ normal(mu, sigma);

        alpha ~ lognormal(0, 0.2);
        bT ~ normal(0, 0.5);
        sigma ~ exponential(1);
    }
"""

data = {
    'N': N,
    'h0': d2.h0.values,
    'h1': d2.h1.values,
    'T': d2.treatment.values,
}

posteriori = stan.build(model, data=data)
samples = posteriori.sample(num_chains=4, num_samples=1000)
Vide.summary(samples)
mean std 7.0% 93.0%
alpha 1.60 0.03 1.55 1.65
bT 0.02 0.04 -0.05 0.09
sigma 2.16 0.15 1.87 2.43
Vide.plot_forest(samples, title="Only Treatment using d2")
_images/parte_6_71_0.png
# R Code 6.16 rerun with the new data (d2)

model = """
    data {
        int N;
        vector[N] h0;
        vector[N] h1;
        vector[N] T;  // Treatment
        vector[N] F;  // Fungus
    }

    parameters {
        real<lower=0> alpha;  // the lognormal prior requires alpha > 0
        real bT;
        real bF;
        real<lower=0> sigma;
    }

    model {
        vector[N] mu;
        vector[N] p;
        
        p = alpha + bT * T + bF * F;
        mu = h0 .* p;  
    
        // likelihood
        h1 ~ normal(mu, sigma);
    
        // priors
        alpha ~ lognormal(0, 0.25);
        bT ~ normal(0, 0.5);
        bF ~ normal(0, 0.5);
        sigma ~ exponential(1);
    }
"""

data = {
    'N': N,
    'h0': d2.h0.values,
    'h1': d2.h1.values,
    'T': d2.treatment.values,
    'F': d2.fungus.values,
}

posteriori = stan.build(model, data=data)
samples = posteriori.sample(num_chains=4, num_samples=1000)
Vide.summary(samples)
mean std 7.0% 93.0%
alpha 1.53 0.04 1.46 1.60
bT 0.06 0.04 -0.02 0.14
bF 0.11 0.04 0.03 0.19
sigma 2.10 0.15 1.83 2.36
Vide.plot_forest(samples, title="Treatments and Fungus")
_images/parte_6_74_0.png

Collider Bias

R Code 6.21

Simulation

  1. Each year, \(20\) people are born with uniformly distributed happiness values.

  2. Each year, each person ages \(1\) year. Happiness does not change.

  3. At age \(18\), an individual can marry; the odds of marriage are proportional to their happiness (see the sketch after this list).

  4. Once married, an individual stays married.

  5. At age 65, the individual leaves the sample (moves to Spain).
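As a quick check of rule 3: the simulation below uses a yearly marriage probability of inv_logit(happiness - 4), so marriage is rare but increasingly likely for happier individuals; a minimal sketch of the two extremes:

# Yearly marriage probability at the extremes of the happiness scale,
# inlining the inv_logit formula used in the simulation code below
print(np.exp(-2 - 4) / (1 + np.exp(-2 - 4)))  # least happy (-2): ~0.0025
print(np.exp(2 - 4) / (1 + np.exp(2 - 4)))    # happiest (+2):   ~0.12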

# Function based on https://github.com/rmcelreath/rethinking/blob/master/R/sim_happiness.R
# inv_logit mirrors the R function at https://stat.ethz.ch/R-manual/R-devel/library/boot/html/inv.logit.html

def inv_logit(x):
    return np.exp(x) / (1 + np.exp(x))

def sim_happiness(seed=1977 , N_years=1000 , max_age=65 , N_births=20 , aom=18):
    np.random.seed(seed)
    
    df = pd.DataFrame(columns=['age', 'married', 'happiness'])

    for i in range(N_years):
        # Update age
        df['age'] += 1

        # Move to Spain when age == max_age
        df.drop(df[df['age'] == max_age].index, inplace=True)
        
        # Will marry?
        index_unmarried_aom = df.query((f'age>={aom} and married==0')).index.tolist()
        weddings = np.random.binomial(1, inv_logit(df.loc[index_unmarried_aom, 'happiness'] - 4))
        df.loc[index_unmarried_aom, 'married'] = weddings
        
        # New borns
        df_aux = pd.DataFrame(columns=['age', 'married', 'happiness'])

        df_aux.loc[:, 'age'] = np.zeros(N_births).astype(int)
        df_aux.loc[:, 'married'] = np.zeros(N_births).astype(int)
        df_aux.loc[:, 'happiness'] = np.linspace(-2, 2, N_births)  # np.random.uniform(0, 1, N_births)
        
        df = pd.concat([df, df_aux], ignore_index=True)  # DataFrame.append was removed in pandas 2.0
        
    return df
df = sim_happiness(seed=1997, N_years=1000)
df.describe(percentiles=[0.055, 0.945], include='all').T
count unique top freq mean std min 5.5% 50% 94.5% max
age 1300.0 65.0 64.0 20.0 NaN NaN NaN NaN NaN NaN NaN
married 1300.0 2.0 0.0 950.0 NaN NaN NaN NaN NaN NaN NaN
happiness 1300.0 NaN NaN NaN -8.335213e-17 1.214421 -2.0 -1.789474 -1.110223e-16 1.789474 2.0
# Figure 6.4
plt.figure(figsize=(17, 6))

colors = ['white' if is_married == 0 else 'blue' for is_married in df.married ]

plt.scatter(df.age, df.happiness, color=colors)

plt.title('White - Unmarried   |    Blue - Married')
plt.xlabel('Age')
plt.ylabel('Happiness')
plt.show()
_images/parte_6_80_0.png

R Code 6.22

df2 = df[df.age > 17].copy()  # Only adults
df2.loc[:, 'age'] = (df2.age - 18) / (65 - 18)

R Code 6.23

Model:

\[ happiness_i \sim Normal(\mu_i, \sigma) \]
\[ \mu_i = \alpha_{MID[i]} + \beta_A \times A_i \]

Priors:

\[ \alpha_{MID[i]} \sim Normal(0, 1) \]
\[ \beta_A \sim Normal(0, 2) \]
\[ \sigma \sim Exponential(1) \]
model = """
    data {
        int N;
        vector[N] age;
        vector[N] happiness;
        array[N] int married;  // must be an integer array because it indexes alpha
    }
    
    parameters {
        vector[2] alpha;  // one intercept per marriage status; could also be declared: array[2] real alpha;
        real beta_age;
        real<lower=0> sigma;
    }
    
    model {
        vector[N] mu;
        
        for (i in 1:N){
            mu[i] = alpha[ married[i] ] + beta_age * age[i];
        }
        
        happiness ~ normal(mu, sigma);
        
        // Priors
        alpha ~ normal(0, 1);
        beta_age ~ normal(0, 2);
        sigma ~ exponential(1);
    }
"""

data = {
    'N': len(df2.happiness.values),
    'age': df2.age.values,
    'happiness': df2.happiness.values,
    'married': df2.married.values + 1  # shift by one because Stan indexing starts at 1
}

posteriori = stan.build(model, data=data)
samples = posteriori.sample(num_chains=4, num_samples=1000)
Vide.summary(samples)
mean std 7.0% 93.0%
alpha[0] -0.19 0.06 -0.30 -0.07
alpha[1] 1.38 0.09 1.22 1.53
beta_age -0.81 0.11 -1.02 -0.62
sigma 0.97 0.02 0.93 1.02
Vide.plot_forest(samples, title='Married and Unmarried')
_images/parte_6_87_0.png

R Code 6.24

model = """
    data {
        int N;
        vector[N] age;
        vector[N] happiness;
    }
    
    parameters {
        real alpha;
        real beta_age;
        real<lower=0> sigma;
    }
    
    model {
        vector[N] mu;
        
        for (i in 1:N){
            mu[i] = alpha + beta_age * age[i];
        }
        
        happiness ~ normal(mu, sigma);
        
        alpha ~ normal(0, 1);
        beta_age ~ normal(0, 2);
        sigma ~ exponential(1);
    }
"""

data = {
    'N': len(df2.happiness.values),
    'happiness': df2.happiness.values,
    'age': df2.age.values,
}

posteriori = stan.build(model, data=data)
samples = posteriori.sample(num_chains=4, num_samples=1000)
Vide.summary(samples)
mean std 7.0% 93.0%
alpha -0.00 0.08 -0.15 0.14
beta_age 0.00 0.14 -0.23 0.27
sigma 1.22 0.03 1.17 1.27
Vide.plot_forest(samples, title='Without married variable')
_images/parte_6_91_0.png

R Code 6.25

N = 200  # Number of triads (G, P, C)

b_GP = 1  # Direct effect of G on P 
b_GC = 0  # Direct effect of G on C
b_PC = 1  # Direct effect of P on C
b_U  = 2  # Direct effect of U on  P and C

R Code 6.26

np.random.seed(3)

U = 2 * np.random.binomial(n=1, p=0.5, size=N) - 1  # Unobserved confounder, coded {-1, 1}
# U = np.random.normal(0, 1, N)  # a more realistic simulation of U

G = np.random.normal(0, 1, size=N)  # Grandparent education (no direct effect on C, since b_GC = 0)

P = np.random.normal(b_GP*G + b_U*U, 1, size=N)

C = np.random.normal(b_PC*P + b_GC*G + b_U*U, 1, size=N)

d = pd.DataFrame.from_dict({'C':C, 'P':P, 'G':G, 'U':U})

d.head()
C P G U
0 0.986995 1.197787 -0.795915 1
1 5.715446 3.573636 0.072746 1
2 -4.387763 -3.131400 -0.261240 -1
3 2.598454 0.462321 -1.298047 1
4 6.614388 4.931193 2.676112 1

R Code 6.27

# C ~ P + G

model = """
    data {
        int N;
        vector[N] C;
        vector[N] P;
        vector[N] G;
    }
    
    parameters {
        real alpha;
        real b_PC;
        real b_GC;
        real<lower=0> sigma;
    }
    
    model {
        vector[N] mu;
        
        for (i in 1:N){
            mu[i] = alpha + b_PC*P[i] + b_GC*G[i];
        }
        
        C ~ normal(mu, sigma);
        
        alpha ~ normal(0, 1);
        b_PC ~ normal(0, 1);
        b_GC ~ normal(0, 1);
        sigma ~ exponential(1);
    }
"""

data = {
    'N': len(d.C.values),
    'C': d.C.values,
    'P': d.P.values,
    'G': d.G.values,
}

posteriori = stan.build(model, data=data)
samples = posteriori.sample(num_chains=4, num_samples=1000)
Vide.summary(samples)
mean std 7.0% 93.0%
alpha 0.07 0.10 -0.10 0.24
b_PC 1.77 0.04 1.70 1.84
b_GC -0.82 0.10 -1.01 -0.65
sigma 1.33 0.07 1.20 1.45
Vide.plot_forest(samples)
_images/parte_6_99_0.png
# Figure 6.5
plt.figure(figsize=(17, 6))

colors = ['black' if u <= 0 else 'blue' for u in U]  # color by the unobserved confounder U

x = np.linspace(-3, 4)
y = np.mean(samples['alpha']) + np.mean(samples['b_GC']) * x

plt.plot(x, y, c='k')

plt.scatter(G, C, c=colors)
plt.xlabel('Grandparent education (G)')
plt.ylabel('Grandchild education (C)')
plt.title('Education - collider bias')
plt.show()
_images/parte_6_100_0.png

R Code 6.28

model = """
    data {
        int N;
        vector[N] C;
        vector[N] P;
        vector[N] G;
        vector[N] U;
    }
    
    parameters {
        real alpha;
        real b_PC;
        real b_GC;
        real b_U;
        real<lower=0> sigma;
    }
    
    model {
        vector[N] mu;
        
        for (i in 1:N){
            mu[i] = alpha + b_PC*P[i] + b_GC*G[i] + b_U*U[i];
        }
        
        C ~ normal(mu, sigma);
        
        alpha ~ normal(0, 1);
        b_GC ~ normal(0, 1);
        b_PC ~ normal(0, 1);
        b_U  ~ normal(0, 1);
        sigma ~ exponential(1);
    }
"""

data = {
    'N': len(d.C.values),
    'C': d.C.values,
    'P': d.P.values,
    'G': d.G.values,
    'U': d.U.values,
}

posteriori = stan.build(model, data=data)
samples = posteriori.sample(num_chains=4, num_samples=1000)
Vide.summary(samples)
mean std 7.0% 93.0%
alpha 0.07 0.07 -0.06 0.21
b_PC 1.00 0.07 0.86 1.12
b_GC -0.03 0.10 -0.22 0.15
b_U 1.97 0.17 1.68 2.27
sigma 1.02 0.05 0.93 1.11
Vide.plot_forest(samples)
_images/parte_6_104_0.png

R Code 6.29

Reference: ksachdeva

dag_6_1 = CausalGraphicalModel(
    nodes=["C", "U", "B", "A", "X", "Y"],
    edges=[
        ("U", "X"),
        ("A", "U"),
        ("A", "C"),
        ("C", "Y"),
        ("U", "B"),
        ("C", "B"),
        ("X", "Y"),
    ],
)

pgm = daft.PGM()
coordinates = {
    "U": (0, 2),
    "C": (4, 2),
    "A": (2, 3),
    "B": (2, 1),
    "X": (0, 0),
    "Y": (4, 0),
}
for node in dag_6_1.dag.nodes:
    pgm.add_node(node, node, *coordinates[node])
for edge in dag_6_1.dag.edges:
    pgm.add_edge(*edge)
pgm.render()

plt.show()
_images/parte_6_106_0.png
all_adjustment_sets = dag_6_1.get_all_backdoor_adjustment_sets("X", "Y")

for s in all_adjustment_sets:
    if all(not t.issubset(s) for t in all_adjustment_sets if t != s):
        if s != {"U"}:
            print(s)
frozenset({'C'})
frozenset({'A'})

R Code 6.30

dag_6_2 = CausalGraphicalModel(
    nodes=['S', 'A', 'M', 'W', 'D'],
    edges=[
        ('S','W'),
        ('S','M'),
        ('S','A'),
        ('A','M'),
        ('A','D'),
        ('M','D'),
        ('W','D'),
    ],
)

# Drawing the DAG
pgm = daft.PGM()
coordinates = {
    "S": (0, 2),
    "A": (0, 0),
    "M": (1, 1),
    "W": (2, 2),
    "D": (2, 0),
}
for node in dag_6_2.dag.nodes:
    pgm.add_node(node, node, *coordinates[node])
for edge in dag_6_2.dag.edges:
    pgm.add_edge(*edge)
pgm.render()

plt.show()
_images/parte_6_109_0.png
# R Code 6.30

all_adjustment_sets = dag_6_2.get_all_backdoor_adjustment_sets("W", "D")


for s in all_adjustment_sets:
    if all(not t.issubset(s) for t in all_adjustment_sets if t != s):
        print(s)
frozenset({'A', 'M'})
frozenset({'S'})

R Code 6.31

Reference: Fehiepsi - Numpyro

all_independencies = dag_6_2.get_all_independence_relationships()

for s in all_independencies:
    if all(
        t[0] != s[0] or t[1] != s[1] or not t[2].issubset(s[2])
        for t in all_independencies
        if t != s
    ):
        print(s)
('M', 'W', {'S'})
('W', 'A', {'S'})
('S', 'D', {'W', 'A', 'M'})