6 - Os DAGs assombrados & O Terror Causal¶

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from videpy import Vide

import networkx as nx
# from causalgraphicalmodels import CausalGraphicalModel

import stan
import nest_asyncio

plt.style.use('default')

plt.rcParams['axes.facecolor'] = 'lightgray'

# To DAG's
import daft
from causalgraphicalmodels import CausalGraphicalModel

# To running the stan in jupyter notebook
nest_asyncio.apply()

RCode 6.1 - Pag 162¶

np.random.seed(1914)

N = 200
p = 0.1

# Não correlacionado noticiabilidade(newsworthiness) e confiabilidade(trustworthiness)
nw = np.random.normal(0, 1, N)
tw = np.random.normal(0, 1, N)

# Selecionando os 10% melhores
s = nw + tw  # Score total
q = np.quantile(s, 1-p)  # Top 10% 
selected = [ True if s_i >= q else False for s_i in s ]
print('Noticiabilidade(newsworthiness): \n\n', nw[selected], '\n\n')
print('Confiabilidade(trustworthiness):\n\n', tw[selected], '\n\n')

print('Correlação: ', np.correlate(tw[selected], nw[selected]))

Noticiabilidade(newsworthiness): 

 [ 0.82456357  1.85614543  0.85556981  1.4066898   1.6026727   1.42256068
  0.71166144  1.30222516  0.56454812  3.02039213  0.93879118  1.04202561
 -0.4640972   0.12272254  1.99579665  1.59807671  1.82853378  1.15394068
  1.48966066  1.22640255] 


Confiabilidade(trustworthiness):

 [ 1.24510334  1.29709756  2.10293057  0.66088341  1.76728533  0.64448519
  1.17598088  0.35356094  1.08805257 -0.72327249  1.31462133  1.54253157
  2.8128998   1.51862223  1.18048237  2.31948695 -0.09763283  0.87205345
  0.74979249  0.46329186] 


Correlação:  [19.93737242]

plt.figure(figsize=(17, 6))

plt.scatter(tw, nw, s=6, color='gray')
plt.scatter(tw[selected], nw[selected], s=7, color='blue')

plt.title('Figure 6.1 - Pag162')
plt.xlabel('Noticiabilidade ($newsworthiness$)')
plt.ylabel('Confiabilidade ($trustworthiness$)')

plt.grid(ls='--', color='white', alpha=0.3)

RCode 6.2 - pag163¶

N = 100

np.random.seed(909)  # Teste com outras sementes

height = np.random.normal(10, 2,  N)

leg_proportion = np.random.uniform(0.4, 0.5, N)

leg_left  = np.random.left = leg_proportion * height + np.random.normal(0, 0.02, N)
leg_right = np.random.left = leg_proportion * height + np.random.normal(0, 0.02, N)


df = pd.DataFrame({'height': height, 
                   'leg_left': leg_left, 
                   'leg_right': leg_right})
df.head()

	height	leg_left	leg_right
0	8.463728	4.094675	4.078446
1	9.854070	4.776475	4.687749
2	8.668694	4.192607	4.256472
3	7.523768	3.088674	3.088206
4	9.381352	4.093217	4.048181

plt.figure(figsize=(17, 6))

plt.scatter(leg_right, leg_left, s=5, alpha=0.3)

plt.title('Dados dos legs')
plt.xlabel('Leg Right')
plt.ylabel('Leg Left')

plt.grid(ls='--', color='white', alpha=0.4)

plt.show()

plt.figure(figsize=(17, 6))
plt.hist(df.height, rwidth=0.9, density=True)
plt.grid(ls='--', color='white', alpha=0.4)
plt.show()

plt.figure(figsize=(17, 6))
plt.hist(df.leg_left, rwidth=0.9, density=True, alpha=0.5)
plt.hist(df.leg_right, rwidth=0.9, density=True, alpha=0.5)

plt.grid(ls='--', color='white', alpha=0.4)
plt.show()

model = """
    data {
        int<lower=0> N;
        vector[N] height;
        vector[N] leg_left;
        vector[N] leg_right;
    }
    
    parameters {
        real alpha;
        real beta_left;
        real beta_right;
        real<lower=0> sigma; 
    }
    
    model {
        alpha ~ normal(10, 100);
        beta_left ~ normal(2, 10);
        beta_right ~ normal(2, 10);
        sigma ~ exponential(1);
        
        height ~ normal(alpha + beta_left * leg_left + beta_right * leg_right, sigma);
    }
"""

data = {
    'N': N,
    'height': height,
    'leg_left': leg_left,
    'leg_right': leg_right
}

posteriori = stan.build(model, data=data)
samples = posteriori.sample(num_chains=4, num_samples=1000)

alpha = samples['alpha'].flatten()
beta_left = samples['beta_left'].flatten()
beta_right = samples['beta_right'].flatten()
sigma = samples['sigma'].flatten()

RCode 6.4 - pag164¶

Vide.summary(samples)

	mean	std	7.0%	93.0%
alpha	0.94	0.33	0.29	1.48
beta_left	-1.38	2.08	-5.23	2.34
beta_right	3.38	2.06	-0.21	7.31
sigma	0.66	0.05	0.58	0.74

Vide.plot_forest(samples, title='Leg right & Leg left')

RCode 6.5 - pag164¶

plt.figure(figsize=(17, 6))

plt.scatter(beta_right, beta_left, s=5, alpha=0.3)

plt.title('Posteriori Beta legs')
plt.xlabel('Beta Right')
plt.ylabel('Beta Left')

plt.grid(ls='--', color='white', alpha=0.4)

plt.show()

RCode 6.6 - pag 165¶

plt.figure(figsize=(17, 6))

plt.hist((beta_left + beta_right), density=True, alpha=0.8, bins=100, rwidth=0.9)

plt.title('Posteriori')
plt.xlabel('Soma de Beta_left e beta_right')
plt.ylabel('Densidade')

plt.grid(ls='--', color='white', alpha=0.4)

plt.show()

# Comparação com os beta indivíduais
plt.figure(figsize=(17, 6))

plt.hist(beta_left, density=True, alpha=0.8, bins=100, rwidth=0.9)  # Beta left

plt.hist(beta_right, density=True, alpha=0.8, bins=100, rwidth=0.9)  # Beta Right

plt.title('Comparativo entre as Posterioris de Beta Left e Beta Right')
plt.xlabel('Betas')
plt.ylabel('Densidade')

plt.grid(ls='--', color='white', alpha=0.1)

plt.show()

RCode 6.7 - Pag 166¶

model = """
    data {
        int N;
        vector[N] leg_left;
        vector[N] height;
    }
    
    parameters {
        real alpha;
        real beta_left;
        real sigma;
    }
    
    model {
        alpha ~ normal(10, 100);
        beta_left ~ normal(2, 10);
        sigma ~ exponential(1);
        
        height ~ normal(alpha + beta_left * leg_left, sigma);
    }
"""

data = {
    'N': len(height),
    'leg_left': leg_left,
    'height': height,
}

posteriori = stan.build(model, data=data)
samples = posteriori.sample(num_chains=4, num_samples=1000)

alpha = samples['alpha'].flatten()
beta_left = samples['beta_left'].flatten()
sigma = samples['sigma'].flatten()

# RCode 6.7 - Continuação
Vide.summary(samples)

	mean	std	7.0%	93.0%
alpha	0.83	0.33	0.28	1.43
beta_left	2.02	0.07	1.90	2.14
sigma	0.67	0.05	0.58	0.75

Vide.plot_forest(samples, title='Leg Left')

R Code 6.8¶

df = pd.read_csv('data/milk.csv', sep=';')

df_std = df[['kcal.per.g', 'perc.fat', 'perc.lactose']].copy()

df_std['kcal.per.g'] = (df_std['kcal.per.g'] - df_std['kcal.per.g'].mean()) / df_std['kcal.per.g'].std()
df_std['perc.fat'] = (df_std['perc.fat'] - df_std['perc.fat'].mean()) / df_std['perc.fat'].std()
df_std['perc.lactose'] = (df_std['perc.lactose'] - df_std['perc.lactose'].mean()) / df_std['perc.lactose'].std()

df_std.head()

	kcal.per.g	perc.fat	perc.lactose
0	-0.940041	-1.217243	1.307262
1	-0.816126	-1.030355	1.011285
2	-1.125913	-1.391531	1.382679
3	-1.001998	-1.335535	1.586874
4	-0.258511	-0.469693	0.257115

# Não tem nenhum 'missing values'

df_std.isna().sum()

kcal.per.g      0
perc.fat        0
perc.lactose    0
dtype: int64

R Code 6.9 - Pag 167¶

# kcal.per.g  regredido em perc.fat

model_kf = """
    data {
        int N;
        vector[N] outcome;
        vector[N] predictor;
    }
    
    parameters {
        real alpha;
        real beta;
        real<lower=0> sigma;
    }
    
    model {
        alpha ~ normal(0, 0.2);
        beta ~ normal(0, 0.5);
        sigma ~ exponential(1);
        
        outcome ~ normal(alpha + beta * predictor, sigma);
    }
"""

data_kf = {
    'N': len(df_std['kcal.per.g']),
    'outcome': list(df_std['kcal.per.g'].values),
    'predictor': list(df_std['perc.fat'].values),
}

posteriori_kf = stan.build(model_kf, data=data_kf)
samples_kf = posteriori_kf.sample(num_chains=4, num_samples=1000)

# kcal.per.g  regredido em  perc.lactose

model_kl = """
    data {
        int N;
        vector[N] outcome;
        vector[N] predictor;
    }
    
    parameters {
        real alpha;
        real beta;
        real<lower=0> sigma;
    }
    
    model {
        alpha ~ normal(0, 0.2);
        beta ~ normal(0, 0.5);
        sigma ~ exponential(1);
        
        outcome ~ normal(alpha + beta * predictor, sigma);
    }
"""

data_kl = {
    'N': len(df_std['kcal.per.g']),
    'outcome': df_std['kcal.per.g'].values,
    'predictor': df_std['perc.lactose'].values,
}

posteriori_kl = stan.build(model_kl, data=data_kl)
samples_kl = posteriori_kl.sample(num_chains=4, num_samples=1000)

Vide.summary(samples_kf)

	mean	std	7.0%	93.0%
alpha	0.00	0.08	-0.14	0.16
beta	0.86	0.09	0.68	1.02
sigma	0.49	0.07	0.37	0.62

Vide.plot_forest(samples_kf, title='perc.fat')

Vide.summary(samples_kl)

	mean	std	7.0%	93.0%
alpha	0.00	0.07	-0.12	0.14
beta	-0.90	0.08	-1.05	-0.77
sigma	0.41	0.06	0.31	0.52

Vide.plot_forest(samples_kl, title='perc.lactose')

R Code 6.10 - pag 167¶

model = """
    data {
        int N;
        vector[N] F;  // Fat
        vector[N] L;  // Lactose
        vector[N] K;  // kcal/g
    }
    
    parameters {
        real alpha;
        real bF;
        real bL;
        real sigma;
    }
    
    model {
        alpha ~ normal(0, 0.2);
        bF ~ normal(0, 0.5);
        bL ~ normal(0, 0.5);
        sigma ~ exponential(1);
        
        K ~ normal(alpha + bF*F + bL*L, sigma);
    }
"""

data = {
    'N': len(df_std['kcal.per.g']),
    'F': df_std['perc.fat'].values,
    'L': df_std['perc.lactose'].values,
    'K': df_std['kcal.per.g'].values,
}

posteriori_FL = stan.build(model, data=data)
samples_FL = posteriori_FL.sample(num_chains=4, num_samples=1000)

Vide.summary(samples_FL)

	mean	std	7.0%	93.0%
alpha	-0.00	0.07	-0.13	0.12
bF	0.25	0.19	-0.11	0.58
bL	-0.66	0.19	-1.00	-0.31
sigma	0.41	0.06	0.31	0.51

Vide.plot_forest(samples_FL, title='Perc.Fat & Perc.Lactose')

R Code 6.11 - Pag 168 - Figure 6.3¶

pd.plotting.scatter_matrix(df_std, diagonal='hist', grid=True, figsize=(17, 6))
plt.show()

R Code 6.12 - Overthinking - Rever¶

model = """
    data {
        int N;
        vector[N] kcal_per_g;
        vector[N] perc_fat;
        vector[N] new_predictor_X;
    }
    
    parameters {
        real alpha;
        real bF;
        real bX;
        real<lower=0> sigma;
    }
    
    model {
        kcal_per_g ~ normal(alpha + bF * perc_fat + bX * new_predictor_X, sigma);
    }
"""

def generate_predictor_x(r=0.9):
    N = len(df['perc.fat'].values)
    
    mean = r * df['perc.fat'].values
    sd = np.sqrt((1 - r**2) * np.var(df['perc.fat'].values))
    
    return np.random.normal(mean, sd, N)  # New Predictor X

def generate_data_dict(r=0.9):
    data = {
        'N': len(df['kcal.per.g']),
        'kcal_per_g': df['kcal.per.g'].values,
        'perc_fat': df['perc.fat'].values,
        'new_predictor_X': generate_predictor_x(r=r),
    }
    return data

def adjust_model(r=0.9):
    
    parameter_mean_samples  = []
    
    for _ in range(1):  # In book running 100x
        # Runnning the model
        posteriori = stan.build(model, data=generate_data_dict(r=r))
        samples = posteriori.sample(num_chains=4, num_samples=1000)
        
        # Get parameter slope mean
        parameter_mean_samples.append(samples['bF'].flatten().mean())
            
    return parameter_mean_samples

stddev = []
r_sequence = np.arange(0, 0.99, 0.1)  # In book using 0.01

for r in r_sequence:
    parameter = adjust_model(r=r)
    stddev.append(np.mean(parameter))

plt.figure(figsize=(17, 6))

plt.plot(r_sequence, stddev)
plt.xlabel("correlation", fontsize=14)
plt.ylabel("stddev", fontsize=14)
plt.show()

R Code 6.13¶

np.random.seed(3)

# Quantidade de plantas
N = 100

# Simulação inicial das alturas
h0 = np.random.normal(10, 2, N)

# Atribuindo tratamentos e simulando fungos e tratamentos
treatment = np.repeat([0,1], repeats=int(N/2))
fungus = np.random.binomial(n=1, p=(0.5 - treatment*0.4), size=N)
h1 = h0 + np.random.normal(5 - 3*fungus, 1, N)

# Dataframe
d = pd.DataFrame.from_dict({'h0': h0, 
                            'h1': h1, 
                            'treatment': treatment, 
                            'fungus': fungus})
d.describe().T

	count	mean	std	min	25%	50%	75%	max
h0	100.0	9.782726	2.138707	4.168524	8.279903	9.642072	11.353860	14.316299
h1	100.0	14.209396	2.766929	6.881795	12.512818	14.190085	15.688766	20.786965
treatment	100.0	0.500000	0.502519	0.000000	0.000000	0.500000	1.000000	1.000000
fungus	100.0	0.270000	0.446196	0.000000	0.000000	0.000000	1.000000	1.000000

R Code 6.14¶

sim_p = np.random.lognormal(0, 0.25, int(1e4))
pd.DataFrame(sim_p, columns=['sim_p']).describe().T

	count	mean	std	min	25%	50%	75%	max
sim_p	10000.0	1.02366	0.259148	0.391611	0.83898	0.993265	1.174575	2.781105

R Code 6.15¶

Modelo:

\[ h_{1,i} \sim Normal(\mu_i, \sigma) \]

\[ \mu_i = h_{0, i} \times p \]

Prioris:

\[ p \sim LogNormal(0, 0.25) \]

\[ sigma \sim Exponential(1) \]

model = """
    data {
        int N; 
        vector[N] h1;
        vector[N] h0;
    }
    
    parameters {
        real<lower=0> p;
        real<lower=0> sigma;
        
    }
    
    model {
        vector[N] mu;
        mu = h0 * p;
        
        h1 ~ normal(mu, sigma);
        
        // Prioris
        p ~ lognormal(0, 0.25);
        sigma ~ exponential(1); 
    }
"""


data = {
    'N': N,
    'h1': h1,
    'h0': h0,
}

posteriori = stan.build(model, data=data)
samples = posteriori.sample(num_chains=4, num_samples=1000)

Vide.summary(samples)

	mean	std	7.0%	93.0%
p	1.43	0.02	1.40	1.46
sigma	1.99	0.15	1.72	2.25

RCode 6.16¶

Modelo post-treatment bias:

\[ h_{1, i} \sim Normal(\mu_i, \sigma) \]

\[ \mu_i = h_{0, i} \times p \]

\[ p = \alpha + \beta_T T_i + \beta_F F_i \]

prioris:

\[ \alpha \sim LogNormal(0, 0.25) \]

\[ \beta_T \sim Normal(0, 0.5) \]

\[ \beta_F \sim Normal(0, 0.5) \]

\[ \sigma \sim Exponential(1) \]

"""
To mu definition below
----------------------

vector[N] a;
vector[N] b;
vector[N] c; 

These operation:
c = a .* b;

Is the same operation:
for (n in 1:N) {
  c[n] = a[n] * b[n];
}

Reference:
https://mc-stan.org/docs/reference-manual/arithmetic-expressions.html
"""

model = """
    data {
        int N;
        vector[N] h0;
        vector[N] h1;
        vector[N] T;  // Treatment
        vector[N] F;  // Fungus
    }

    parameters {
        real alpha;
        real bT;
        real bF;
        real<lower=0> sigma;
    }

    model {
        vector[N] mu;
        vector[N] p;
        
        p = alpha + bT * T + bF * F;
        mu = h0 .* p;  
    
        // likelihood
        h1 ~ normal(mu, sigma);
    
        // prioris
        alpha ~ lognormal(0, 0.25);
        bT ~ normal(0, 0.5);
        bF ~ normal(0, 0.5);
        sigma ~ exponential(1);
    }
"""

data = {
    'N': N,
    'h0': h0,
    'h1': h1,
    'T': treatment,
    'F': fungus,
}

posteriori = stan.build(model, data=data)
samples = posteriori.sample(num_chains=4, num_samples=1000)

Vide.summary(samples)

	mean	std	7.0%	93.0%
alpha	1.53	0.03	1.48	1.59
bT	-0.03	0.04	-0.09	0.03
bF	-0.32	0.04	-0.39	-0.25
sigma	1.44	0.11	1.26	1.64

Vide.plot_forest(samples, title='Treatment and Fungus')

R Code 6.17¶

model = """
    data {
        int N;
        vector[N] h0;
        vector[N] h1;
        vector[N] T;  // Treatment
    }

    parameters {
        real<lower=0> alpha;
        real bT;
        real<lower=0> sigma;
    }
    
    model {
        vector[N] mu;
        vector[N] p;

        p = alpha + bT * T;
        mu = h0 .* p;

        h1 ~ normal(mu, sigma);

        alpha ~ lognormal(0, 0.2);
        bT ~ normal(0, 0.5);
        sigma ~ exponential(1);
    }
"""

data = {
    'N': N,
    'h0': h0,
    'h1': h1,
    'T': treatment,
}

posteriori = stan.build(model, data=data)
samples = posteriori.sample(num_chains=4, num_samples=1000)

Vide.summary(samples)

	mean	std	7.0%	93.0%
alpha	1.36	0.03	1.31	1.41
bT	0.13	0.04	0.06	0.20
sigma	1.89	0.14	1.65	2.14

Vide.plot_forest(samples, title='Treatment without fungus')

R Code 6.18¶

G = nx.DiGraph()

nodes = {0: '$H_0$', 
         1: '$H_1$', 
         2: 'F', 
         3: 'T'}

for i in nodes:
    G.add_node(nodes[i])

edges = [(nodes[0], nodes[1]),
         (nodes[2], nodes[1]),
         (nodes[3], nodes[2])]

G.add_edges_from(edges)

# explicitly set positions
pos = {nodes[0]: (0, 0), 
       nodes[1]: (1, 0), 
       nodes[2]: (1.5, 0), 
       nodes[3]: (2, 0)}

options = {
    "font_size": 15,
    "node_size": 400,
    "node_color": "white",
    "edgecolors": "white",
    "linewidths": 1,
    "width": 1,
}

nx.draw(G, pos, with_labels=True, **options)

# Set margins for the axes so that nodes aren't clipped
ax = plt.gca()
# ax.margins(0.01)
plt.axis("off")
plt.show()

R Code 6.19¶

\[F \_||\_ H_0\]

\[H_0 \_||\_ T\]

\[ H_1 \_||\_ T | F \]

R Code 6.20¶

# np.random.seed(3)

# Quantidade de plantas
N = 100

# Simulação inicial das alturas
h0 = np.random.normal(10, 2, N)

# Atribuindo tratamentos e simulando fungos e tratamentos
treatment = np.repeat([0, 1], repeats=int(N/2))

M = np.random.binomial(n=1, p=0.5, size=N)  # Moisture -> Bernoulli(p=0.5)

fungus = np.random.binomial(n=1, p=(0.5 - treatment * 0.4 + 0.4 * M), size=N)
h1 = h0 + np.random.normal((5 + 3 * M), 1, N)

# Dataframe
d2 = pd.DataFrame.from_dict({'h0': h0, 
                            'h1': h1, 
                            'treatment': treatment, 
                            'fungus': fungus})
d2.describe().T 

	count	mean	std	min	25%	50%	75%	max
h0	100.0	10.091610	1.976358	6.313914	8.850678	10.169278	11.369749	15.273222
h1	100.0	16.510702	2.715874	10.105379	14.711143	15.899131	18.671828	22.662508
treatment	100.0	0.500000	0.502519	0.000000	0.000000	0.500000	1.000000	1.000000
fungus	100.0	0.480000	0.502117	0.000000	0.000000	0.000000	1.000000	1.000000

# RCode 6.17 with new database

model = """
    data {
        int N;
        vector[N] h0;
        vector[N] h1;
        vector[N] T;  // Treatment
    }

    parameters {
        real<lower=0> alpha;
        real bT;
        real<lower=0> sigma;
    }
    
    model {
        vector[N] mu;
        vector[N] p;

        p = alpha + bT * T;
        mu = h0 .* p;

        h1 ~ normal(mu, sigma);

        alpha ~ lognormal(0, 0.2);
        bT ~ normal(0, 0.5);
        sigma ~ exponential(1);
    }
"""

data = {
    'N': N,
    'h0': d2.h0.values,
    'h1': d2.h1.values,
    'T': d2.treatment.values,
}

posteriori = stan.build(model, data=data)
samples = posteriori.sample(num_chains=4, num_samples=1000)

Vide.summary(samples)

	mean	std	7.0%	93.0%
alpha	1.60	0.03	1.55	1.65
bT	0.02	0.04	-0.05	0.09
sigma	2.16	0.15	1.87	2.43

Vide.plot_forest(samples, title="Only Treatment using d2")

# RCode 6.16 with new database

model = """
    data {
        int N;
        vector[N] h0;
        vector[N] h1;
        vector[N] T;  // Treatment
        vector[N] F;  // Fungus
    }

    parameters {
        real alpha;
        real bT;
        real bF;
        real<lower=0> sigma;
    }

    model {
        vector[N] mu;
        vector[N] p;
        
        p = alpha + bT * T + bF * F;
        mu = h0 .* p;  
    
        // likelihood
        h1 ~ normal(mu, sigma);
    
        // prioris
        alpha ~ lognormal(0, 0.25);
        bT ~ normal(0, 0.5);
        bF ~ normal(0, 0.5);
        sigma ~ exponential(1);
    }
"""

data = {
    'N': N,
    'h0': d2.h0.values,
    'h1': d2.h1.values,
    'T': d2.treatment.values,
    'F': d2.fungus.values,
}

posteriori = stan.build(model, data=data)
samples = posteriori.sample(num_chains=4, num_samples=1000)

Vide.summary(samples)

	mean	std	7.0%	93.0%
alpha	1.53	0.04	1.46	1.60
bT	0.06	0.04	-0.02	0.14
bF	0.11	0.04	0.03	0.19
sigma	2.10	0.15	1.83	2.36

Vide.plot_forest(samples, title="Treatments and Fungus")

Collider Bias¶

6.21¶

Simulação

Cada ano, \(20\) pessoas nascem com valores de felicidade uniformente distribuídos
Cada ano, cada uma das pessoas envelhece \(1\) ano. A sua felicidade não muda.
Aos \(18\) anos, um indivíduo de casa com a probabilidade de porporcional a sua felicidade.
Uma vez casado, o indivíduo se mantém casado.
Aos 65 anos, o indivíduo deixa a amostra (Vai morar na Espanha)

# Function based in https://github.com/rmcelreath/rethinking/blob/master/R/sim_happiness.R
# Inv_logit R-function in https://stat.ethz.ch/R-manual/R-devel/library/boot/html/inv.logit.html

def inv_logit(x):
    return np.exp(x) / (1 + np.exp(x))

def sim_happiness(seed=1977 , N_years=1000 , max_age=65 , N_births=20 , aom=18):
    np.random.seed(seed)
    
    df = pd.DataFrame(columns=['age', 'married', 'happiness'])

    for i in range(N_years):
        # Update age
        df['age'] += 1

        # Move to Spain when age == max_age
        df.drop(df[df['age'] == max_age].index, inplace=True)
        
        # Will marry?
        index_unmarried_aom = df.query((f'age>={aom} and married==0')).index.tolist()
        weddings = np.random.binomial(1, inv_logit(df.loc[index_unmarried_aom, 'happiness'] - 4))
        df.loc[index_unmarried_aom, 'married'] = weddings
        
        # New borns
        df_aux = pd.DataFrame(columns=['age', 'married', 'happiness'])

        df_aux.loc[:, 'age'] = np.zeros(N_births).astype(int)
        df_aux.loc[:, 'married'] = np.zeros(N_births).astype(int)
        df_aux.loc[:, 'happiness'] = np.linspace(-2, 2, N_births)  # np.random.uniform(0, 1, N_births)
        
        df = df.append(df_aux, ignore_index=True)
        
    return df

df = sim_happiness(seed=1997, N_years=1000)
df.describe(percentiles=[0.055, 0.945], include='all').T

	count	unique	top	freq	mean	std	min	5.5%	50%	94.5%	max
age	1300.0	65.0	64.0	20.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN
married	1300.0	2.0	0.0	950.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN
happiness	1300.0	NaN	NaN	NaN	-8.335213e-17	1.214421	-2.0	-1.789474	-1.110223e-16	1.789474	2.0

# Figure 6.21
plt.figure(figsize=(17, 6))

colors = ['white' if is_married == 0 else 'blue' for is_married in df.married ]

plt.scatter(df.age, df.happiness, color=colors)

plt.title('White - Unmarried   |    Blue - Married')
plt.xlabel('Age')
plt.ylabel('Happiness')
plt.show()

R Code 6.22¶

df2 = df[df.age > 17].copy()  # Only adults
df2.loc[:, 'age'] = (df2.age - 18) / (65 - 18)

R Code 6.23¶

Modelo:

\[ happiness \sim Normal(\mu_i, \sigma) \]

\[ \mu_i = \alpha_{_{MID}[i]} + \beta_A \times A_i \]