6 - The Haunted DAG & The Causal Terror¶
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from videpy import Vide
import networkx as nx
# from causalgraphicalmodels import CausalGraphicalModel
import stan
import nest_asyncio
plt.style.use('default')
plt.rcParams['axes.facecolor'] = 'lightgray'
# For DAGs
import daft
from causalgraphicalmodels import CausalGraphicalModel
# To run Stan inside a Jupyter notebook
nest_asyncio.apply()
R Code 6.1 - Page 162¶
np.random.seed(1914)
N = 200
p = 0.1
# Newsworthiness and trustworthiness are uncorrelated
nw = np.random.normal(0, 1, N)
tw = np.random.normal(0, 1, N)
# Select the top 10%
s = nw + tw  # Total score
q = np.quantile(s, 1 - p)  # Top 10% threshold
selected = s >= q  # Boolean mask of the selected proposals
print('Newsworthiness: \n\n', nw[selected], '\n\n')
print('Trustworthiness:\n\n', tw[selected], '\n\n')
print('Correlation: ', np.correlate(tw[selected], nw[selected]))
Newsworthiness:
[ 0.82456357 1.85614543 0.85556981 1.4066898 1.6026727 1.42256068
0.71166144 1.30222516 0.56454812 3.02039213 0.93879118 1.04202561
-0.4640972 0.12272254 1.99579665 1.59807671 1.82853378 1.15394068
1.48966066 1.22640255]
Trustworthiness:
[ 1.24510334 1.29709756 2.10293057 0.66088341 1.76728533 0.64448519
1.17598088 0.35356094 1.08805257 -0.72327249 1.31462133 1.54253157
2.8128998 1.51862223 1.18048237 2.31948695 -0.09763283 0.87205345
0.74979249 0.46329186]
Correlation: [19.93737242]
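Note that np.correlate computes an unnormalized sliding inner product, not Pearson's correlation coefficient, which is why the value above falls outside \([-1, 1]\). A minimal check with np.corrcoef on the same arrays; among the selected proposals it should come out strongly negative (the selection-distortion effect):
# Pearson correlation between the selected scores; np.corrcoef returns
# the 2x2 correlation matrix, so take the off-diagonal entry.
print('Pearson r: ', np.corrcoef(tw[selected], nw[selected])[0, 1])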
plt.figure(figsize=(17, 6))
plt.scatter(nw, tw, s=6, color='gray')
plt.scatter(nw[selected], tw[selected], s=7, color='blue')
plt.title('Figure 6.1 - Page 162')
plt.xlabel('Newsworthiness')
plt.ylabel('Trustworthiness')
plt.grid(ls='--', color='white', alpha=0.3)
R Code 6.2 - Page 163¶
N = 100
np.random.seed(909)  # Try other seeds
height = np.random.normal(10, 2, N)
leg_proportion = np.random.uniform(0.4, 0.5, N)
leg_left = leg_proportion * height + np.random.normal(0, 0.02, N)
leg_right = leg_proportion * height + np.random.normal(0, 0.02, N)
df = pd.DataFrame({'height': height,
'leg_left': leg_left,
'leg_right': leg_right})
df.head()
 | height | leg_left | leg_right
---|---|---|---
0 | 8.463728 | 4.094675 | 4.078446
1 | 9.854070 | 4.776475 | 4.687749
2 | 8.668694 | 4.192607 | 4.256472
3 | 7.523768 | 3.088674 | 3.088206
4 | 9.381352 | 4.093217 | 4.048181
plt.figure(figsize=(17, 6))
plt.scatter(leg_right, leg_left, s=5, alpha=0.3)
plt.title('Leg data')
plt.xlabel('Leg Right')
plt.ylabel('Leg Left')
plt.grid(ls='--', color='white', alpha=0.4)
plt.show()
plt.figure(figsize=(17, 6))
plt.hist(df.height, rwidth=0.9, density=True)
plt.grid(ls='--', color='white', alpha=0.4)
plt.show()
plt.figure(figsize=(17, 6))
plt.hist(df.leg_left, rwidth=0.9, density=True, alpha=0.5)
plt.hist(df.leg_right, rwidth=0.9, density=True, alpha=0.5)
plt.grid(ls='--', color='white', alpha=0.4)
plt.show()
model = """
data {
int<lower=0> N;
vector[N] height;
vector[N] leg_left;
vector[N] leg_right;
}
parameters {
real alpha;
real beta_left;
real beta_right;
real<lower=0> sigma;
}
model {
alpha ~ normal(10, 100);
beta_left ~ normal(2, 10);
beta_right ~ normal(2, 10);
sigma ~ exponential(1);
height ~ normal(alpha + beta_left * leg_left + beta_right * leg_right, sigma);
}
"""
data = {
'N': N,
'height': height,
'leg_left': leg_left,
'leg_right': leg_right
}
posteriori = stan.build(model, data=data)
samples = posteriori.sample(num_chains=4, num_samples=1000)
alpha = samples['alpha'].flatten()
beta_left = samples['beta_left'].flatten()
beta_right = samples['beta_right'].flatten()
sigma = samples['sigma'].flatten()
R Code 6.4 - Page 164¶
Vide.summary(samples)
 | mean | std | 7.0% | 93.0%
---|---|---|---|---
alpha | 0.94 | 0.33 | 0.29 | 1.48
beta_left | -1.38 | 2.08 | -5.23 | 2.34
beta_right | 3.38 | 2.06 | -0.21 | 7.31
sigma | 0.66 | 0.05 | 0.58 | 0.74
Vide.plot_forest(samples, title='Leg right & Leg left')
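Because the two legs carry almost the same information, the posterior draws of the two slopes should be nearly perfectly negatively correlated. A quick check on the flattened draws from above:
# Posterior correlation between beta_left and beta_right;
# a value near -1 signals the non-identifiability discussed in the text.
print(np.corrcoef(beta_left, beta_right)[0, 1])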
R Code 6.5 - Page 164¶
plt.figure(figsize=(17, 6))
plt.scatter(beta_right, beta_left, s=5, alpha=0.3)
plt.title('Posterior of the leg slopes')
plt.xlabel('Beta Right')
plt.ylabel('Beta Left')
plt.grid(ls='--', color='white', alpha=0.4)
plt.show()
R Code 6.6 - Page 165¶
plt.figure(figsize=(17, 6))
plt.hist((beta_left + beta_right), density=True, alpha=0.8, bins=100, rwidth=0.9)
plt.title('Posterior')
plt.xlabel('Sum of beta_left and beta_right')
plt.ylabel('Density')
plt.grid(ls='--', color='white', alpha=0.4)
plt.show()
# Comparison with the individual betas
plt.figure(figsize=(17, 6))
plt.hist(beta_left, density=True, alpha=0.8, bins=100, rwidth=0.9) # Beta left
plt.hist(beta_right, density=True, alpha=0.8, bins=100, rwidth=0.9) # Beta Right
plt.title('Posteriors of beta_left and beta_right compared')
plt.xlabel('Betas')
plt.ylabel('Density')
plt.grid(ls='--', color='white', alpha=0.1)
plt.show()
R Code 6.7 - Page 166¶
model = """
data {
int N;
vector[N] leg_left;
vector[N] height;
}
parameters {
real alpha;
real beta_left;
    real<lower=0> sigma;
}
model {
alpha ~ normal(10, 100);
beta_left ~ normal(2, 10);
sigma ~ exponential(1);
height ~ normal(alpha + beta_left * leg_left, sigma);
}
"""
data = {
'N': len(height),
'leg_left': leg_left,
'height': height,
}
posteriori = stan.build(model, data=data)
samples = posteriori.sample(num_chains=4, num_samples=1000)
alpha = samples['alpha'].flatten()
beta_left = samples['beta_left'].flatten()
sigma = samples['sigma'].flatten()
# R Code 6.7 - continued
Vide.summary(samples)
 | mean | std | 7.0% | 93.0%
---|---|---|---|---
alpha | 0.83 | 0.33 | 0.28 | 1.43
beta_left | 2.02 | 0.07 | 1.90 | 2.14
sigma | 0.67 | 0.05 | 0.58 | 0.75
Vide.plot_forest(samples, title='Leg Left')
R Code 6.8¶
df = pd.read_csv('data/milk.csv', sep=';')
df_std = df[['kcal.per.g', 'perc.fat', 'perc.lactose']].copy()
for col in df_std.columns:
    df_std[col] = (df_std[col] - df_std[col].mean()) / df_std[col].std()
df_std.head()
 | kcal.per.g | perc.fat | perc.lactose
---|---|---|---
0 | -0.940041 | -1.217243 | 1.307262
1 | -0.816126 | -1.030355 | 1.011285
2 | -1.125913 | -1.391531 | 1.382679
3 | -1.001998 | -1.335535 | 1.586874
4 | -0.258511 | -0.469693 | 0.257115
# No missing values
df_std.isna().sum()
kcal.per.g 0
perc.fat 0
perc.lactose 0
dtype: int64
R Code 6.9 - Page 167¶
# kcal.per.g regressed on perc.fat
model_kf = """
data {
int N;
vector[N] outcome;
vector[N] predictor;
}
parameters {
real alpha;
real beta;
real<lower=0> sigma;
}
model {
alpha ~ normal(0, 0.2);
beta ~ normal(0, 0.5);
sigma ~ exponential(1);
outcome ~ normal(alpha + beta * predictor, sigma);
}
"""
data_kf = {
'N': len(df_std['kcal.per.g']),
    'outcome': df_std['kcal.per.g'].values,
    'predictor': df_std['perc.fat'].values,
}
posteriori_kf = stan.build(model_kf, data=data_kf)
samples_kf = posteriori_kf.sample(num_chains=4, num_samples=1000)
# kcal.per.g regressed on perc.lactose
model_kl = """
data {
int N;
vector[N] outcome;
vector[N] predictor;
}
parameters {
real alpha;
real beta;
real<lower=0> sigma;
}
model {
alpha ~ normal(0, 0.2);
beta ~ normal(0, 0.5);
sigma ~ exponential(1);
outcome ~ normal(alpha + beta * predictor, sigma);
}
"""
data_kl = {
'N': len(df_std['kcal.per.g']),
'outcome': df_std['kcal.per.g'].values,
'predictor': df_std['perc.lactose'].values,
}
posteriori_kl = stan.build(model_kl, data=data_kl)
samples_kl = posteriori_kl.sample(num_chains=4, num_samples=1000)
Vide.summary(samples_kf)
 | mean | std | 7.0% | 93.0%
---|---|---|---|---
alpha | 0.00 | 0.08 | -0.14 | 0.16
beta | 0.86 | 0.09 | 0.68 | 1.02
sigma | 0.49 | 0.07 | 0.37 | 0.62
Vide.plot_forest(samples_kf, title='perc.fat')
Vide.summary(samples_kl)
 | mean | std | 7.0% | 93.0%
---|---|---|---|---
alpha | 0.00 | 0.07 | -0.12 | 0.14
beta | -0.90 | 0.08 | -1.05 | -0.77
sigma | 0.41 | 0.06 | 0.31 | 0.52
Vide.plot_forest(samples_kl, title='perc.lactose')
R Code 6.10 - Page 167¶
model = """
data {
int N;
vector[N] F; // Fat
vector[N] L; // Lactose
vector[N] K; // kcal/g
}
parameters {
real alpha;
real bF;
real bL;
    real<lower=0> sigma;
}
model {
alpha ~ normal(0, 0.2);
bF ~ normal(0, 0.5);
bL ~ normal(0, 0.5);
sigma ~ exponential(1);
K ~ normal(alpha + bF*F + bL*L, sigma);
}
"""
data = {
'N': len(df_std['kcal.per.g']),
'F': df_std['perc.fat'].values,
'L': df_std['perc.lactose'].values,
'K': df_std['kcal.per.g'].values,
}
posteriori_FL = stan.build(model, data=data)
samples_FL = posteriori_FL.sample(num_chains=4, num_samples=1000)
Vide.summary(samples_FL)
 | mean | std | 7.0% | 93.0%
---|---|---|---|---
alpha | -0.00 | 0.07 | -0.13 | 0.12
bF | 0.25 | 0.19 | -0.11 | 0.58
bL | -0.66 | 0.19 | -1.00 | -0.31
sigma | 0.41 | 0.06 | 0.31 | 0.51
Vide.plot_forest(samples_FL, title='Perc.Fat & Perc.Lactose')
R Code 6.11 - Page 168 - Figure 6.3¶
pd.plotting.scatter_matrix(df_std, diagonal='hist', grid=True, figsize=(17, 6))
plt.show()
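The same pattern can be read off numerically; a minimal check of the pairwise correlations among the standardized variables:
# Correlation matrix of the milk variables; perc.fat and perc.lactose
# should be strongly negatively correlated with each other.
print(df_std.corr())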
R Code 6.12 - Overthinking (to revisit)¶
model = """
data {
int N;
vector[N] kcal_per_g;
vector[N] perc_fat;
vector[N] new_predictor_X;
}
parameters {
real alpha;
real bF;
real bX;
real<lower=0> sigma;
}
model {
    // No explicit priors here: Stan then uses flat priors, mirroring the lm() fit in the book
    kcal_per_g ~ normal(alpha + bF * perc_fat + bX * new_predictor_X, sigma);
}
"""
def generate_predictor_x(r=0.9):
N = len(df['perc.fat'].values)
mean = r * df['perc.fat'].values
sd = np.sqrt((1 - r**2) * np.var(df['perc.fat'].values))
return np.random.normal(mean, sd, N) # New Predictor X
def generate_data_dict(r=0.9):
data = {
'N': len(df['kcal.per.g']),
'kcal_per_g': df['kcal.per.g'].values,
'perc_fat': df['perc.fat'].values,
'new_predictor_X': generate_predictor_x(r=r),
}
return data
def adjust_model(r=0.9):
    parameter_std_samples = []
    for _ in range(1):  # The book runs this 100x
        # Run the model
        posteriori = stan.build(model, data=generate_data_dict(r=r))
        samples = posteriori.sample(num_chains=4, num_samples=1000)
        # Posterior standard deviation of the slope bF
        parameter_std_samples.append(samples['bF'].flatten().std())
    return parameter_std_samples
stddev = []
r_sequence = np.arange(0, 0.99, 0.1)  # The book uses a step of 0.01
for r in r_sequence:
parameter = adjust_model(r=r)
stddev.append(np.mean(parameter))
plt.figure(figsize=(17, 6))
plt.plot(r_sequence, stddev)
plt.xlabel("correlation", fontsize=14)
plt.ylabel("stddev", fontsize=14)
plt.show()
R Code 6.13¶
np.random.seed(3)
# Number of plants
N = 100
# Simulate initial heights
h0 = np.random.normal(10, 2, N)
# Assign treatments and simulate fungus and growth
treatment = np.repeat([0,1], repeats=int(N/2))
fungus = np.random.binomial(n=1, p=(0.5 - treatment*0.4), size=N)
h1 = h0 + np.random.normal(5 - 3*fungus, 1, N)
# Dataframe
d = pd.DataFrame.from_dict({'h0': h0,
'h1': h1,
'treatment': treatment,
'fungus': fungus})
d.describe().T
 | count | mean | std | min | 25% | 50% | 75% | max
---|---|---|---|---|---|---|---|---
h0 | 100.0 | 9.782726 | 2.138707 | 4.168524 | 8.279903 | 9.642072 | 11.353860 | 14.316299
h1 | 100.0 | 14.209396 | 2.766929 | 6.881795 | 12.512818 | 14.190085 | 15.688766 | 20.786965
treatment | 100.0 | 0.500000 | 0.502519 | 0.000000 | 0.000000 | 0.500000 | 1.000000 | 1.000000
fungus | 100.0 | 0.270000 | 0.446196 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000
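A quick sanity check on the simulation: by construction, the treatment lowers the probability of fungus from 0.5 to 0.1, and the group means should be close to those values:
# Proportion of plants with fungus in each treatment arm.
print(d.groupby('treatment')['fungus'].mean())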
R Code 6.14¶
sim_p = np.random.lognormal(0, 0.25, int(1e4))
pd.DataFrame(sim_p, columns=['sim_p']).describe().T
 | count | mean | std | min | 25% | 50% | 75% | max
---|---|---|---|---|---|---|---|---
sim_p | 10000.0 | 1.02366 | 0.259148 | 0.391611 | 0.83898 | 0.993265 | 1.174575 | 2.781105
R Code 6.15¶
Model:
\[ h_{1,i} \sim Normal(\mu_i, \sigma) \]
\[ \mu_i = h_{0,i} \times p \]
Priors:
\[ p \sim LogNormal(0, 0.25) \]
\[ \sigma \sim Exponential(1) \]
model = """
data {
int N;
vector[N] h1;
vector[N] h0;
}
parameters {
real<lower=0> p;
real<lower=0> sigma;
}
model {
vector[N] mu;
mu = h0 * p;
h1 ~ normal(mu, sigma);
    // Priors
p ~ lognormal(0, 0.25);
sigma ~ exponential(1);
}
"""
data = {
'N': N,
'h1': h1,
'h0': h0,
}
posteriori = stan.build(model, data=data)
samples = posteriori.sample(num_chains=4, num_samples=1000)
Vide.summary(samples)
 | mean | std | 7.0% | 93.0%
---|---|---|---|---
p | 1.43 | 0.02 | 1.40 | 1.46
sigma | 1.99 | 0.15 | 1.72 | 2.25
R Code 6.16¶
Post-treatment bias model:
\[ h_{1, i} \sim Normal(\mu_i, \sigma) \]
\[ \mu_i = h_{0, i} \times p \]
\[ p = \alpha + \beta_T T_i + \beta_F F_i \]
Priors:
\[ \alpha \sim LogNormal(0, 0.25) \]
\[ \beta_T \sim Normal(0, 0.5) \]
\[ \beta_F \sim Normal(0, 0.5) \]
\[ \sigma \sim Exponential(1) \]
"""
To mu definition below
----------------------
vector[N] a;
vector[N] b;
vector[N] c;
These operation:
c = a .* b;
Is the same operation:
for (n in 1:N) {
c[n] = a[n] * b[n];
}
Reference:
https://mc-stan.org/docs/reference-manual/arithmetic-expressions.html
"""
model = """
data {
int N;
vector[N] h0;
vector[N] h1;
vector[N] T; // Treatment
vector[N] F; // Fungus
}
parameters {
real alpha;
real bT;
real bF;
real<lower=0> sigma;
}
model {
vector[N] mu;
vector[N] p;
p = alpha + bT * T + bF * F;
mu = h0 .* p;
// likelihood
h1 ~ normal(mu, sigma);
    // priors
alpha ~ lognormal(0, 0.25);
bT ~ normal(0, 0.5);
bF ~ normal(0, 0.5);
sigma ~ exponential(1);
}
"""
data = {
'N': N,
'h0': h0,
'h1': h1,
'T': treatment,
'F': fungus,
}
posteriori = stan.build(model, data=data)
samples = posteriori.sample(num_chains=4, num_samples=1000)
Vide.summary(samples)
 | mean | std | 7.0% | 93.0%
---|---|---|---|---
alpha | 1.53 | 0.03 | 1.48 | 1.59
bT | -0.03 | 0.04 | -0.09 | 0.03
bF | -0.32 | 0.04 | -0.39 | -0.25
sigma | 1.44 | 0.11 | 1.26 | 1.64
Vide.plot_forest(samples, title='Treatment and Fungus')
R Code 6.17¶
model = """
data {
int N;
vector[N] h0;
vector[N] h1;
vector[N] T; // Treatment
}
parameters {
real<lower=0> alpha;
real bT;
real<lower=0> sigma;
}
model {
vector[N] mu;
vector[N] p;
p = alpha + bT * T;
mu = h0 .* p;
h1 ~ normal(mu, sigma);
alpha ~ lognormal(0, 0.2);
bT ~ normal(0, 0.5);
sigma ~ exponential(1);
}
"""
data = {
'N': N,
'h0': h0,
'h1': h1,
'T': treatment,
}
posteriori = stan.build(model, data=data)
samples = posteriori.sample(num_chains=4, num_samples=1000)
Vide.summary(samples)
 | mean | std | 7.0% | 93.0%
---|---|---|---|---
alpha | 1.36 | 0.03 | 1.31 | 1.41
bT | 0.13 | 0.04 | 0.06 | 0.20
sigma | 1.89 | 0.14 | 1.65 | 2.14
Vide.plot_forest(samples, title='Treatment without fungus')
R Code 6.18¶
G = nx.DiGraph()
nodes = {0: '$H_0$',
1: '$H_1$',
2: 'F',
3: 'T'}
for i in nodes:
G.add_node(nodes[i])
edges = [(nodes[0], nodes[1]),
(nodes[2], nodes[1]),
(nodes[3], nodes[2])]
G.add_edges_from(edges)
# explicitly set positions
pos = {nodes[0]: (0, 0),
nodes[1]: (1, 0),
nodes[2]: (1.5, 0),
nodes[3]: (2, 0)}
options = {
"font_size": 15,
"node_size": 400,
"node_color": "white",
"edgecolors": "white",
"linewidths": 1,
"width": 1,
}
nx.draw(G, pos, with_labels=True, **options)
# Set margins for the axes so that nodes aren't clipped
ax = plt.gca()
# ax.margins(0.01)
plt.axis("off")
plt.show()
R Code 6.20¶
# np.random.seed(3)
# Number of plants
N = 100
# Simulate initial heights
h0 = np.random.normal(10, 2, N)
# Assign treatments and simulate fungus and growth
treatment = np.repeat([0, 1], repeats=int(N/2))
M = np.random.binomial(n=1, p=0.5, size=N) # Moisture -> Bernoulli(p=0.5)
fungus = np.random.binomial(n=1, p=(0.5 - treatment * 0.4 + 0.4 * M), size=N)
h1 = h0 + np.random.normal((5 + 3 * M), 1, N)
# Dataframe
d2 = pd.DataFrame.from_dict({'h0': h0,
'h1': h1,
'treatment': treatment,
'fungus': fungus})
d2.describe().T
 | count | mean | std | min | 25% | 50% | 75% | max
---|---|---|---|---|---|---|---|---
h0 | 100.0 | 10.091610 | 1.976358 | 6.313914 | 8.850678 | 10.169278 | 11.369749 | 15.273222
h1 | 100.0 | 16.510702 | 2.715874 | 10.105379 | 14.711143 | 15.899131 | 18.671828 | 22.662508
treatment | 100.0 | 0.500000 | 0.502519 | 0.000000 | 0.000000 | 0.500000 | 1.000000 | 1.000000
fungus | 100.0 | 0.480000 | 0.502117 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000
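A quick check on the new simulation: treatment still suppresses fungus, but moisture raises the fungus probability in both arms relative to R Code 6.13:
# Proportion of plants with fungus in each treatment arm of d2.
print(d2.groupby('treatment')['fungus'].mean())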
# R Code 6.17 with the new dataset (d2)
model = """
data {
int N;
vector[N] h0;
vector[N] h1;
vector[N] T; // Treatment
}
parameters {
real<lower=0> alpha;
real bT;
real<lower=0> sigma;
}
model {
vector[N] mu;
vector[N] p;
p = alpha + bT * T;
mu = h0 .* p;
h1 ~ normal(mu, sigma);
alpha ~ lognormal(0, 0.2);
bT ~ normal(0, 0.5);
sigma ~ exponential(1);
}
"""
data = {
'N': N,
'h0': d2.h0.values,
'h1': d2.h1.values,
'T': d2.treatment.values,
}
posteriori = stan.build(model, data=data)
samples = posteriori.sample(num_chains=4, num_samples=1000)
Vide.summary(samples)
 | mean | std | 7.0% | 93.0%
---|---|---|---|---
alpha | 1.60 | 0.03 | 1.55 | 1.65
bT | 0.02 | 0.04 | -0.05 | 0.09
sigma | 2.16 | 0.15 | 1.87 | 2.43
Vide.plot_forest(samples, title="Only Treatment using d2")
# R Code 6.16 with the new dataset (d2)
model = """
data {
int N;
vector[N] h0;
vector[N] h1;
vector[N] T; // Treatment
vector[N] F; // Fungus
}
parameters {
real alpha;
real bT;
real bF;
real<lower=0> sigma;
}
model {
vector[N] mu;
vector[N] p;
p = alpha + bT * T + bF * F;
mu = h0 .* p;
// likelihood
h1 ~ normal(mu, sigma);
    // priors
alpha ~ lognormal(0, 0.25);
bT ~ normal(0, 0.5);
bF ~ normal(0, 0.5);
sigma ~ exponential(1);
}
"""
data = {
'N': N,
'h0': d2.h0.values,
'h1': d2.h1.values,
'T': d2.treatment.values,
'F': d2.fungus.values,
}
posteriori = stan.build(model, data=data)
samples = posteriori.sample(num_chains=4, num_samples=1000)
Vide.summary(samples)
 | mean | std | 7.0% | 93.0%
---|---|---|---|---
alpha | 1.53 | 0.04 | 1.46 | 1.60
bT | 0.06 | 0.04 | -0.02 | 0.14
bF | 0.11 | 0.04 | 0.03 | 0.19
sigma | 2.10 | 0.15 | 1.83 | 2.36
Vide.plot_forest(samples, title="Treatments and Fungus")
Collider Bias¶
R Code 6.21¶
Simulation:
Each year, \(20\) people are born with uniformly distributed happiness values.
Each year, each person ages \(1\) year; happiness does not change.
At age \(18\), an individual can marry, with probability proportional to their happiness.
Once married, an individual stays married.
At age 65, the individual leaves the sample (moves to Spain).
# Function based in https://github.com/rmcelreath/rethinking/blob/master/R/sim_happiness.R
# Inv_logit R-function in https://stat.ethz.ch/R-manual/R-devel/library/boot/html/inv.logit.html
def inv_logit(x):
    return np.exp(x) / (1 + np.exp(x))  # equivalent to scipy.special.expit
def sim_happiness(seed=1977, N_years=1000, max_age=65, N_births=20, aom=18):
np.random.seed(seed)
df = pd.DataFrame(columns=['age', 'married', 'happiness'])
for i in range(N_years):
# Update age
df['age'] += 1
# Move to Spain when age == max_age
df.drop(df[df['age'] == max_age].index, inplace=True)
# Will marry?
        index_unmarried_aom = df.query(f'age >= {aom} and married == 0').index.tolist()
weddings = np.random.binomial(1, inv_logit(df.loc[index_unmarried_aom, 'happiness'] - 4))
df.loc[index_unmarried_aom, 'married'] = weddings
        # Newborns
        df_aux = pd.DataFrame(columns=['age', 'married', 'happiness'])
        df_aux.loc[:, 'age'] = np.zeros(N_births).astype(int)
        df_aux.loc[:, 'married'] = np.zeros(N_births).astype(int)
        df_aux.loc[:, 'happiness'] = np.linspace(-2, 2, N_births)  # np.random.uniform(0, 1, N_births)
        df = pd.concat([df, df_aux], ignore_index=True)  # DataFrame.append was removed in pandas 2.0
return df
df = sim_happiness(seed=1997, N_years=1000)
df.describe(percentiles=[0.055, 0.945], include='all').T
 | count | unique | top | freq | mean | std | min | 5.5% | 50% | 94.5% | max
---|---|---|---|---|---|---|---|---|---|---|---
age | 1300.0 | 65.0 | 64.0 | 20.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN
married | 1300.0 | 2.0 | 0.0 | 950.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN
happiness | 1300.0 | NaN | NaN | NaN | -8.335213e-17 | 1.214421 | -2.0 | -1.789474 | -1.110223e-16 | 1.789474 | 2.0
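The collider pattern can already be previewed in the raw data: among adults, married people are happier on average even though marriage has no causal effect on happiness in this simulation. A minimal check, using the same adult cutoff as R Code 6.22:
# Mean happiness by marital status among adults; the gap is pure
# selection, since individual happiness never changes in the simulation.
print(df[df.age > 17].groupby('married')['happiness'].mean())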
# Figure 6.4
plt.figure(figsize=(17, 6))
colors = ['white' if is_married == 0 else 'blue' for is_married in df.married ]
plt.scatter(df.age, df.happiness, color=colors)
plt.title('White - Unmarried | Blue - Married')
plt.xlabel('Age')
plt.ylabel('Happiness')
plt.show()
R Code 6.22¶
df2 = df[df.age > 17].copy() # Only adults
df2.loc[:, 'age'] = (df2.age - 18) / (65 - 18)
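A quick range check on the rescaled variable (age 18 maps to 0; age 64, the oldest still in the sample, maps to roughly 0.98):
# Rescaled age should span [0, (64 - 18) / (65 - 18)].
print(df2.age.min(), df2.age.max())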
R Code 6.23¶
Model:
\[ happiness_i \sim Normal(\mu_i, \sigma) \]
\[ \mu_i = \alpha_{MID[i]} + \beta_A \times A_i \]
Priors:
\[ \alpha_{MID[i]} \sim Normal(0, 1) \]
\[ \beta_A \sim Normal(0, 2) \]
\[ \sigma \sim Exponential(1) \]
model = """
data {
int N;
vector[N] age;
vector[N] happiness;
array[N] int married; // Must be integer because this is index to alpha.
}
parameters {
    vector[2] alpha; // could also be declared as: array[2] real alpha;
real beta_age;
real<lower=0> sigma;
}
model {
vector[N] mu;
for (i in 1:N){
mu[i] = alpha[ married[i] ] + beta_age * age[i];
}
happiness ~ normal(mu, sigma);
    // Priors
    alpha ~ normal(0, 1);
    beta_age ~ normal(0, 2);
sigma ~ exponential(1);
}
"""
data = {
'N': len(df2.happiness.values),
'age': df2.age.values,
'happiness': df2.happiness.values,
    'married': df2.married.values + 1  # Stan indexing starts at 1
}
posteriori = stan.build(model, data=data)
samples = posteriori.sample(num_chains=4, num_samples=1000)
Vide.summary(samples)
 | mean | std | 7.0% | 93.0%
---|---|---|---|---
alpha[0] | -0.19 | 0.06 | -0.30 | -0.07
alpha[1] | 1.38 | 0.09 | 1.22 | 1.53
beta_age | -0.81 | 0.11 | -1.02 | -0.62
sigma | 0.97 | 0.02 | 0.93 | 1.02
Vide.plot_forest(samples, title='Married and Unmarried')
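In PyStan 3 the draws of a vector parameter come back with the parameter dimension first, so the married-minus-unmarried contrast can be computed directly (a minimal sketch, assuming samples['alpha'] has shape (2, draws)):
# Contrast between the two intercepts; clearly positive here, reflecting
# selection into marriage rather than any causal effect on happiness.
contrast = samples['alpha'][1] - samples['alpha'][0]
print(contrast.mean(), contrast.std())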
R Code 6.24¶
model = """
data {
int N;
vector[N] age;
vector[N] happiness;
}
parameters {
real alpha;
real beta_age;
real<lower=0> sigma;
}
model {
vector[N] mu;
for (i in 1:N){
mu[i] = alpha + beta_age * age[i];
}
happiness ~ normal(mu, sigma);
alpha ~ normal(0, 1);
beta_age ~ normal(0, 2);
sigma ~ exponential(1);
}
"""
data = {
'N': len(df2.happiness.values),
'happiness': df2.happiness.values,
'age': df2.age.values,
}
posteriori = stan.build(model, data=data)
samples = posteriori.sample(num_chains=4, num_samples=1000)
Vide.summary(samples)
 | mean | std | 7.0% | 93.0%
---|---|---|---|---
alpha | -0.00 | 0.08 | -0.15 | 0.14
beta_age | 0.00 | 0.14 | -0.23 | 0.27
sigma | 1.22 | 0.03 | 1.17 | 1.27
Vide.plot_forest(samples, title='Without married variable')
R Code 6.25¶
N = 200  # Number of triads (G, P, C)
b_GP = 1 # Direct effect of G on P
b_GC = 0 # Direct effect of G on C
b_PC = 1 # Direct effect of P on C
b_U = 2 # Direct effect of U on P and C
R Code 6.26¶
np.random.seed(3)
U = 2 * np.random.binomial(n=1, p=0.5, size=N) - 1 # {-1, 1}
# U = np.random.normal(0, 1, N) # Simulation more realistic example
G = np.random.normal(0, 1, size=N)  # Grandparent education; no direct effect on C, since b_GC = 0
P = np.random.normal(b_GP*G + b_U*U, 1, size=N)
C = np.random.normal(b_PC*P + b_GC*G + b_U*U, 1, size=N)
d = pd.DataFrame.from_dict({'C':C, 'P':P, 'G':G, 'U':U})
d.head()
 | C | P | G | U
---|---|---|---|---
0 | 0.986995 | 1.197787 | -0.795915 | 1
1 | 5.715446 | 3.573636 | 0.072746 | 1
2 | -4.387763 | -3.131400 | -0.261240 | -1
3 | 2.598454 | 0.462321 | -1.298047 | 1
4 | 6.614388 | 4.931193 | 2.676112 | 1
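Even with b_GC = 0, G and C are correlated through the G → P → C path; the raw correlations make this visible:
# Pairwise correlations among the simulated triads; G relates to C
# only indirectly through P, since its direct effect is zero.
print(d[['C', 'P', 'G']].corr())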
R Code 6.27¶
# C ~ P + G
model = """
data {
int N;
vector[N] C;
vector[N] P;
vector[N] G;
}
parameters {
real alpha;
real b_PC;
real b_GC;
real<lower=0> sigma;
}
model {
vector[N] mu;
for (i in 1:N){
mu[i] = alpha + b_PC*P[i] + b_GC*G[i];
}
C ~ normal(mu, sigma);
alpha ~ normal(0, 1);
b_PC ~ normal(0, 1);
b_GC ~ normal(0, 1);
sigma ~ exponential(1);
}
"""
data = {
'N': len(d.C.values),
'C': d.C.values,
'P': d.P.values,
'G': d.G.values,
}
posteriori = stan.build(model, data=data)
samples = posteriori.sample(num_chains=4, num_samples=1000)
Vide.summary(samples)
 | mean | std | 7.0% | 93.0%
---|---|---|---|---
alpha | 0.07 | 0.10 | -0.10 | 0.24
b_PC | 1.77 | 0.04 | 1.70 | 1.84
b_GC | -0.82 | 0.10 | -1.01 | -0.65
sigma | 1.33 | 0.07 | 1.20 | 1.45
Vide.plot_forest(samples)
# Figure 6.5
plt.figure(figsize=(17, 6))
colors = ['black' if u <= 0 else 'blue' for u in U] # Unobserved
x = np.linspace(-3, 4)
y = np.mean(samples['alpha']) + np.mean(samples['b_GC']) * x
plt.plot(x, y, c='k')
plt.scatter(G, C, c=colors)
plt.xlabel('Grandparent education (G)')
plt.ylabel('Grandchild education (C)')
plt.title('Education - Collider bias')
plt.show()
R Code 6.28¶
model = """
data {
int N;
vector[N] C;
vector[N] P;
vector[N] G;
vector[N] U;
}
parameters {
real alpha;
real b_PC;
real b_GC;
real b_U;
real<lower=0> sigma;
}
model {
vector[N] mu;
for (i in 1:N){
mu[i] = alpha + b_PC*P[i] + b_GC*G[i] + b_U*U[i];
}
C ~ normal(mu, sigma);
alpha ~ normal(0, 1);
b_GC ~ normal(0, 1);
b_PC ~ normal(0, 1);
b_U ~ normal(0, 1);
sigma ~ exponential(1);
}
"""
data = {
'N': len(d.C.values),
'C': d.C.values,
'P': d.P.values,
'G': d.G.values,
'U': d.U.values,
}
posteriori = stan.build(model, data=data)
samples = posteriori.sample(num_chains=4, num_samples=1000)
Vide.summary(samples)
 | mean | std | 7.0% | 93.0%
---|---|---|---|---
alpha | 0.07 | 0.07 | -0.06 | 0.21
b_PC | 1.00 | 0.07 | 0.86 | 1.12
b_GC | -0.03 | 0.10 | -0.22 | 0.15
b_U | 1.97 | 0.17 | 1.68 | 2.27
sigma | 1.02 | 0.05 | 0.93 | 1.11
Vide.plot_forest(samples)
R Code 6.29¶
Reference: ksachdeva
dag_6_1 = CausalGraphicalModel(
nodes=["C", "U", "B", "A", "X", "Y"],
edges=[
("U", "X"),
("A", "U"),
("A", "C"),
("C", "Y"),
("U", "B"),
("C", "B"),
("X", "Y"),
],
)
pgm = daft.PGM()
coordinates = {
"U": (0, 2),
"C": (4, 2),
"A": (2, 3),
"B": (2, 1),
"X": (0, 0),
"Y": (4, 0),
}
for node in dag_6_1.dag.nodes:
pgm.add_node(node, node, *coordinates[node])
for edge in dag_6_1.dag.edges:
pgm.add_edge(*edge)
pgm.render()
plt.show()
all_adjustment_sets = dag_6_1.get_all_backdoor_adjustment_sets("X", "Y")
for s in all_adjustment_sets:
if all(not t.issubset(s) for t in all_adjustment_sets if t != s):
if s != {"U"}:
print(s)
frozenset({'C'})
frozenset({'A'})
R Code 6.30¶
dag_6_2 = CausalGraphicalModel(
nodes=['S', 'A', 'M', 'W', 'D'],
edges=[
('S','W'),
('S','M'),
('S','A'),
('A','M'),
('A','D'),
('M','D'),
('W','D'),
],
)
# Drawing the DAG
pgm = daft.PGM()
coordinates = {
"S": (0, 2),
"A": (0, 0),
"M": (1, 1),
"W": (2, 2),
"D": (2, 0),
}
for node in dag_6_2.dag.nodes:
pgm.add_node(node, node, *coordinates[node])
for edge in dag_6_2.dag.edges:
pgm.add_edge(*edge)
pgm.render()
plt.show()
# R Code 6.30
all_adjustment_sets = dag_6_2.get_all_backdoor_adjustment_sets("W", "D")
for s in all_adjustment_sets:
if all(not t.issubset(s) for t in all_adjustment_sets if t != s):
print(s)
frozenset({'A', 'M'})
frozenset({'S'})
R Code 6.31¶
Reference: Fehiepsi - Numpyro
all_independencies = dag_6_2.get_all_independence_relationships()
for s in all_independencies:
if all(
t[0] != s[0] or t[1] != s[1] or not t[2].issubset(s[2])
for t in all_independencies
if t != s
):
print(s)
('M', 'W', {'S'})
('W', 'A', {'S'})
('S', 'D', {'W', 'A', 'M'})