Dualizaiton working paper

미정
propensity score를 활용해서 외부자성을 계산해보자.
Author

STATAID

Published

2025.04.25

Important

실험적 단계의 연구논문으로, 아직 미완성된 게시글입니다.

1 Rawdata

Code
require(moonBook)
require(tidyverse)
require(haven)
require(gridExtra)
Code
p23_raw <- read_dta("../../../rawdata/klips23p.dta")
p05_raw <- read_dta("../../../rawdata/klips05p.dta")

h23_raw <- read_dta("../../../rawdata/klips23h.dta") %>%
  mutate(wage = h232102 / sqrt(h230150)) %>%
  select(hhid23, wage) %>%
  drop_na()

h05_raw <- read_dta("../../../rawdata/klips05h.dta") %>%
  mutate(wage = h052102 / sqrt(h050150)) %>%
  select(hhid05, wage) %>%
  drop_na()
Code
p23 <- p23_raw %>% 
  left_join(h23_raw, by = "hhid23") %>% 
  mutate(
    empsta = ifelse(is.na(p230314), NA, p230314),
    male = ifelse(p230101 == 1, 1, 0) %>% as.factor(),
    age = p230107,
    youth = case_when(
      age >= 15 & age < 35 ~ 1,
      age >= 35 & age <= 64 ~ 0,
      TRUE ~ NA
    ) %>% as.factor(),
  ) %>% 
  mutate(
    partim = ifelse(p230315 == 1, 1, 0) %>% as.factor(),
    fixter_1 = ifelse(p230501 == 1, 1, 0) %>% as.factor(),
    fixter_2 = case_when(
      p230501 == 2 & p230601 == 1 & p230602 == 2 ~ 1,
      p230501 == 2 & p230601 == 2 & (p230605>=1 & p230605<=6) ~ 1,
      TRUE ~ 0
    ) %>% as.factor(),
    fixter_3 = ifelse(empsta == 2, 1, 0) %>% as.factor(),
    fixter = ifelse(fixter_1 == 1 | fixter_2 == 1 | fixter_3 == 2, 1, 0) %>% as.factor(),
    daiwor_1 = if_else(p230508 == 1, 1, 0, 0) %>% as.factor(),
    daiwor_2 = ifelse(empsta == 3, 1, 0) %>% as.factor(),
    daiwor = ifelse(daiwor_1 == 1 | daiwor_2 == 2, 1, 0) %>% as.factor(),
    agewor_1 = ifelse(p230611 == 2, 1, 0) %>% as.factor(),
    agewor_2 = ifelse(p230611 == 3, 1, 0) %>% as.factor(),
    agewor = ifelse(agewor_1 == 1 | agewor_2 == 1, 1, 0) %>% as.factor(),
    domwor = ifelse(p230613 == 1, 1, 0) %>% as.factor(),
    speemp = ifelse(p230612 == 1, 1, 0) %>% as.factor(),
    natpen = ifelse(p232101 == 1, 1, 0) %>% as.factor(),
    spepen = ifelse(p232102 == 1, 1, 0) %>% as.factor(),
    totpen = ifelse(natpen == 1 | spepen == 1, 1, 0) %>% as.factor(),
    nathel = ifelse(p232103 == 1, 1, 0) %>% as.factor(),
    empins = ifelse(p232104 == 1 | spepen == 1, 1, 0) %>% as.factor(),
    indins = ifelse(p232105 == 1 | spepen == 1, 1, 0) %>% as.factor(),
    retben = ifelse(p234101 == 1, 1, 0) %>% as.factor(),
  ) %>% 
  mutate(
    standard = ifelse(partim == 0 
                      & fixter == 0 
                      & daiwor == 0 
                      & agewor == 0 
                      & domwor == 0 
                      & speemp == 0,
                      1,
                      0) %>% as.factor(), 
    socins = ifelse(totpen == 1 
                    & nathel == 1 
                    & empins == 1 
                    & indins == 1, 
                    1, 
                    0) %>% as.factor(),
  ) %>% 
  mutate(
    KSCO5=p230350 %>% str_sub(1, 3) %>% as.factor(),
    class_serv = case_when(
      p230350 >= 140 & p230350 < 160 ~ 1,
      p230350 >= 170 & p230350 < 190 ~ 1,
      p230350 >= 240 & p230350 < 260 ~ 1,
      p230350 >= 270 & p230350 < 290 ~ 1,
      p230350 >= 310 & p230350 < 540 ~ 1,
      TRUE ~ 0
    ), 
    class3 = case_when(
      (p230350 >= 400 & p230350<600) | p230350 >= 900 ~ 1,
      (p230350 >= 300 & p230350<400) | (p230350 >= 700 & p230350<900) ~ 2,
      p230350 >= 0 & p230350<300 ~ 3
    ),
    class5 = case_when(
      class_serv == 1 & class3 == 1 ~ 1,
      class_serv == 0 & class3 == 1 ~ 2,
      class_serv == 1 & class3 == 2 ~ 3,
      class_serv == 0 & class3 == 2 ~ 4,
      class3 == 3 ~ 5,
      TRUE ~ NA
    ) %>% as.factor(),
    large_firm = case_when(
      (p230402 >= 1 & p230402 <= 299) | p230403 <= 7 ~ 1,
      p230402 >= 300 | (p230403 >= 8 & p230403 <= 10) ~ 2, # | p230401 == 5
      is.na(p230402) == TRUE & is.na(p230403) == TRUE ~ NA,
      TRUE ~ NA
    ) %>% as.factor(),
    firm_group = case_when(
      (p230402 >= 1 & p230402 <= 4) | p230403 <= 1 ~ 1,
      (p230402 >= 5 & p230402 <= 29) | (p230403 >= 2 & p230403 <= 3) ~ 2,
      (p230402 >= 30 & p230402 <= 99) | (p230403 >= 4 & p230403 <= 6) ~ 3,
      (p230402 >= 100 & p230402 <= 299) | (p230403 >= 7 & p230403 <= 7) ~ 4,
      p230402 >= 300 | (p230403 >= 8 & p230403 <= 10) ~ 5,
      TRUE ~ NA
    ) %>% as.factor(),
    firmsize = case_when(
      (p230402 >= 1 & p230402 <= 4) | p230403 == 1 ~ 1,
      (p230402 >= 5 & p230402 <= 9) | p230403 == 2 ~ 2,
      (p230402 >= 10 & p230402 <= 29) | p230403 == 3 ~ 3,
      (p230402 >= 30 & p230402 <= 49) | p230403 == 4 ~ 4,
      (p230402 >= 50 & p230402 <= 69) | p230403 == 5 ~ 5,
      (p230402 >= 70 & p230402 <= 99) | p230403 == 6 ~ 6,
      (p230402 >= 100 & p230402 <= 299) | p230403 == 7 ~ 7,
      (p230402 >= 300 & p230402 <= 499) | p230403 == 8 ~ 8,
      (p230402 >= 500 & p230402 <= 999) | p230403 == 9 ~ 9,
      p230402 >= 1000 | p230403 == 10 ~ 10,
      TRUE ~ NA
    ) %>% as.factor()
  ) %>% 
  filter(empsta <= 3) %>% 
  drop_na(male, age, youth, wage, class5, firm_group, large_firm, KSCO5)
Code
p05 <- p05_raw %>% 
  left_join(h05_raw, by = "hhid05") %>% 
  mutate(
    empsta = ifelse(is.na(p050314), NA, p050314),
    male = ifelse(p050101==1, 1, 0) %>% as.factor(),
    age = p050107,
    youth = case_when(
      age >= 15 & age < 35 ~ 1,
      age >= 35 & age<=64 ~ 0,
      TRUE ~ NA
    ) %>% as.factor(),
  ) %>% 
  mutate(
    partim = ifelse(p050315 == 1, 1, 0) %>% as.factor(),
    fixter_1 = ifelse(p050501 == 1, 1, 0) %>% as.factor(),
    fixter_2 = case_when(p050501 == 2 &
                           p050601 == 2 &
                           (p050605 >= 1 & p050605 <= 6) ~ 1, TRUE ~ 0) %>% as.factor(),
    fixter_3 = ifelse(empsta == 2, 1, 0) %>% as.factor(),
    fixter = ifelse(fixter_1 == 1 | fixter_2 == 1 | fixter_3 == 2, 1, 0) %>% as.factor(),
    daiwor_1 = if_else(p050508 == 1, 1, 0, 0) %>% as.factor(),
    daiwor_2 = ifelse(empsta == 3, 1, 0) %>% as.factor(),
    daiwor = ifelse(daiwor_1 == 1 | daiwor_2 == 2, 1, 0) %>% as.factor(),
    agewor_1=  ifelse(p050611 == 2, 1, 0) %>% as.factor(),
    agewor_2 = ifelse(p050611 == 3, 1, 0) %>% as.factor(),
    agewor = ifelse(agewor_1 == 1 | agewor_2 == 1, 1, 0) %>% as.factor(),
    domwor = ifelse(p050613 == 1, 1, 0) %>% as.factor(),
    speemp = ifelse(p050612 == 1, 1, 0) %>% as.factor(),
    natpen = ifelse(p052101 == 1, 1, 0) %>% as.factor(),
    spepen = ifelse(p052102 == 1, 1, 0) %>% as.factor(),
    totpen = ifelse(natpen == 1 | spepen == 1, 1, 0) %>% as.factor(),
    nathel = ifelse(p052103 == 1, 1, 0) %>% as.factor(),
    empins = ifelse(p052104 == 1 | spepen == 1, 1, 0) %>% as.factor(),
    indins = ifelse(p052105 == 1 | spepen == 1, 1, 0) %>% as.factor(),
    retben = ifelse(p054101 == 1, 1, 0) %>% as.factor(),
  ) %>% 
  mutate(
    standard = ifelse(partim == 0 
                      & fixter == 0 
                      & daiwor == 0 
                      & agewor == 0 
                      & domwor == 0 
                      & speemp == 0, 
                      1, 
                      0),
    socins = ifelse(totpen == 1 
                    & nathel == 1 
                    & empins == 1 
                    & indins == 1, 
                    1, 
                    0)
  ) %>% 
  mutate(
    KSCO5=p050350 %>% str_sub(1, 3) %>% as.factor(),
    class_serv=case_when(
      p050350 >= 140 & p050350 < 160 ~ 1,
      p050350 >= 170 & p050350 < 190 ~ 1,
      p050350 >= 240 & p050350 < 260 ~ 1,
      p050350 >= 270 & p050350 < 290 ~ 1,
      p050350 >= 310 & p050350 < 540 ~ 1,
      TRUE ~ 0
    ) %>% as.factor(),
    class3=case_when(
      (p050350 >= 400 & p050350 < 600) | p050350 >= 900 ~ 1,
      (p050350 >= 300 & p050350 < 400) | (p050350 >= 700 & p050350 < 900) ~ 2,
      p050350>=0 & p050350<300 ~ 3
    ) %>% as.factor(),
    class5 = case_when(
      class_serv == 1 & class3 == 1 ~ 1,
      class_serv == 0 & class3 == 1 ~ 2,
      class_serv == 1 & class3 == 2 ~ 3,
      class_serv == 0 & class3 == 2 ~ 4,
      class3 == 3 ~ 5,
      TRUE ~ NA
    ) %>% as.factor(),
    large_firm = case_when(
      (p050402 >= 1 & p050402 <= 299) | p050403 <= 7 ~ 1,
      p050402 >= 300 | (p050403 >= 8 & p050403 <= 10) ~ 2, # | p050401 == 5
      is.na(p050402) == TRUE & is.na(p050403) == TRUE ~ NA,
      TRUE ~ NA
    ) %>% as.factor(),
    firm_group = case_when(
      (p050402 >= 1 & p050402 <= 4) | p050403 <= 1 ~ 1,
      (p050402 >= 5 & p050402 <= 29) | (p050403 >= 2 & p050403 <= 3) ~ 2,
      (p050402 >= 30 & p050402 <= 99) | (p050403 >= 4 & p050403 <= 6) ~ 3,
      (p050402 >= 100 & p050402 <= 299) | (p050403 >= 7 & p050403 <= 7) ~ 4,
      p050402 >= 300 | (p050403 >= 8 & p050403 <= 10) ~ 5,
      TRUE ~ NA
    ) %>% as.factor(),
    firmsize = case_when(
      (p050402 >= 1 & p050402 <= 4) | p050403 == 1 ~ 1,
      (p050402 >= 5 & p050402 <= 9) | p050403 == 2 ~ 2,
      (p050402 >= 10 & p050402 <= 29) | p050403 == 3 ~ 3,
      (p050402 >= 30 & p050402 <= 49) | p050403 == 4 ~ 4,
      (p050402 >= 50 & p050402 <= 69) | p050403 == 5 ~ 5,
      (p050402 >= 70 & p050402 <= 99) | p050403 == 6 ~ 6,
      (p050402 >= 100 & p050402 <= 299) | p050403 == 7 ~ 7,
      (p050402 >= 300 & p050402 <= 499) | p050403 == 8 ~ 8,
      (p050402 >= 500 & p050402 <= 999) | p050403 == 9 ~ 9,
      p050402 >= 1000 | p050403 == 10 ~ 10,
      TRUE ~ NA
    ) %>% as.factor()
  ) %>% 
  filter(empsta <= 3) %>% 
  drop_na(male, age, youth, wage, class5, firm_group, large_firm, KSCO5)

2 Scoring The Propensity

Code
staemp23 <- glm(standard ~ male + age + log(wage) + class5, 
                family = binomial, 
                data = p23,
                na.action = na.exclude)
# socins23 <- glm(socins ~ male + age + log(wage) + class5 + firmsize, 
#                 family = binomial, 
#                 data = p23,
#                 na.action = na.exclude)

socins23 <- glm(socins ~ male + age + log(wage) + 
                  partim +
                  fixter_1 +
                  fixter_2 +
                  fixter_3 +
                  daiwor_1 +
                  daiwor_2 +
                  agewor_1 +
                  agewor_2 +
                  domwor +
                  speemp,
                family = binomial, 
                data = p23,
                na.action = na.exclude)

x23 <- predict(staemp23, type = "response")
y23 <- predict(socins23, type = "response")

staemp05 <- glm(standard ~ male + age + log(wage) + class5, 
                family = binomial, 
                data = p05,
                na.action = na.exclude)
# socins05 <- glm(socins ~ male + age + wage + class5 + firmsize, 
#                 family = binomial, 
#                 data = p05,
#                 na.action = na.exclude)

socins05 <- glm(socins ~ male + age + log(wage) + class5 +
                  partim +
                  fixter_1 +
                  fixter_2 +
                  fixter_3 +
                  daiwor_1 +
                  daiwor_2 +
                  agewor_1 +
                  agewor_2 +
                  domwor +
                  speemp, 
                family = binomial, 
                data = p05,
                na.action = na.exclude)

x05 <- predict(staemp05, type = "response")
y05 <- predict(socins05, type = "response")


df23 <- p23 %>% 
  mutate(x = x23,
         y = y23) %>% 
  select(pid, x, y, standard, socins,
         male, age, youth, wage, 
         class5, firm_group, large_firm)

g23 <- ggplot(data = df23, aes(x = x, y = y)) +
  scale_x_continuous(limits = c(0, 1)) +
  scale_y_continuous(limits = c(0, 1))

df05 <- p05 %>% 
  mutate(x = x05,
         y = y05) %>% 
  select(pid, x, y, standard, socins,
         male, age, youth, wage, 
         class5, firm_group, large_firm)

g05 <- ggplot(data = df05, aes(x = x, y = y)) +
  scale_x_continuous(limits = c(0, 1)) +
  scale_y_continuous(limits = c(0, 1))
Code
coef05 <- lm(y ~ x * large_firm, df05)$coefficient
coef23 <- lm(y ~ x * large_firm, df23)$coefficient

아래 왼쪽은 2005년, 오른쪽은 2023년

구분 ’05년 중소기업 ’05년 대기업 ’23년 중소기업 ’23년 대기업
절편 -0.986902 -1.0469062 0.456389 0.638658
계수 2.0717141 2.2160975 0.5373582 0.3605782
Code
grid.arrange(
  g23 + 
    theme(legend.position = c(0.2, 0.5),
          legend.background = element_rect(fill = alpha(0.1))) +
    geom_smooth(data = df23 %>% filter(large_firm == 1), 
                method = "lm", 
                color = "#d0473e") +
    geom_point(data = df23 %>% filter(large_firm == 1) %>% slice_sample(n = 100),
               size = 3,
               color = "#d0473e",
               alpha = 0.1) +
    geom_point(data = df23 %>% filter(large_firm == 2) %>% slice_sample(n = 100),
               size = 3,
               color = "#306ec0",
               alpha = 0.01),
  g23 +
    theme(legend.position = c(0.2, 0.5),
          legend.background = element_rect(fill = alpha(0.1))) +
    geom_smooth(data = df23 %>% filter(large_firm == 2), 
                method = "lm", 
                color = "#306ec0") +
    geom_point(data = df23 %>% filter(large_firm == 2) %>% slice_sample(n = 100),
               size = 3,
               color = "#306ec0",
               alpha = 0.1) +
    geom_point(data = df23 %>% filter(large_firm == 1) %>% slice_sample(n = 100),
               size = 3,
               color = "#d0473e", 
               alpha = 0.01),
  g23 + 
    theme(legend.position = c(0.2, 0.5),
          legend.background = element_rect(fill = alpha(0.1))) +
    geom_point(aes(color = class5), size = 3, alpha = 0.1),
  g23 +
    theme(legend.position = c(0.2, 0.5),
          legend.background = element_rect(fill = alpha(0.1))) +
    geom_point(aes(color = male), size = 3, alpha = 0.1),
  nrow = 2,
  ncol = 2
)

Code
grid.arrange(
  g05 + 
    theme(legend.position = c(0.2, 0.5),
          legend.background = element_rect(fill = alpha(0.1))) +
    geom_smooth(data = df05 %>% filter(large_firm == 1), 
                method = "lm", 
                color = "#d0473e") +
    geom_point(data = df05 %>% filter(large_firm == 1) %>% slice_sample(n = 100),
               size = 3,
               color = "#d0473e",
               alpha = 0.1) +
    geom_point(data = df05 %>% filter(large_firm == 2) %>% slice_sample(n = 100),
               size = 3,
               color = "#306ec0",
               alpha = 0.01),
  g05 +
    theme(legend.position = c(0.2, 0.5),
          legend.background = element_rect(fill = alpha(0.1))) +
    geom_smooth(data = df05 %>% filter(large_firm == 2), 
                method = "lm", 
                color = "#306ec0") +
    geom_point(data = df05 %>% filter(large_firm == 2) %>% slice_sample(n = 100),
               size = 3,
               color = "#306ec0",
               alpha = 0.1) +
    geom_point(data = df05 %>% filter(large_firm == 1) %>% slice_sample(n = 100),
               size = 3,
               color = "#d0473e", 
               alpha = 0.01),
  g05 + 
    theme(legend.position = c(0.2, 0.5),
          legend.background = element_rect(fill = alpha(0.1))) +
    geom_point(aes(color = class5), size = 3, alpha = 0.1),
  g05 +
    theme(legend.position = c(0.2, 0.5),
          legend.background = element_rect(fill = alpha(0.1))) +
    geom_point(aes(color = male), size = 3, alpha = 0.1),
  nrow = 2,
  ncol = 2
)

hist(df23$y)

hist(df05$y)

어렵다 어려워. 직종을 줄이니 차이가 거의 없어짐. 대기업효과 보다는 직종, 성별 효과가 더 큰걸까? 만약 그게 맞다면, 뭐하러 이 그래프를 그리냐? 그냥 상호작용 항으로 선형모델에 적합시키면 되지.

df23 <- df23 %>% 
  mutate(
    x.q = case_when(
      x > summary(df23$x)[5] ~ "1/4",
      x > summary(df23$x)[3] & x <= summary(df23$x)[5] ~ "2/4",
      x > summary(df23$x)[2] & x <= summary(df23$x)[3] ~ "3/4",
      x <= summary(df23$x)[2] ~ "4/4",
    ),
    y.q = case_when(
      y > summary(df23$y)[5] ~ "1/4",
      y > summary(df23$y)[3] & y <= summary(df23$y)[5] ~ "2/4",
      y > summary(df23$y)[2] & y <= summary(df23$y)[3] ~ "3/4",
      y <= summary(df23$y)[2] ~ "4/4",  
    )
  )

mytable_sub(x.q ~ male + youth + class5 + large_firm + standard + socins, df23)

                   Descriptive Statistics by 'x.q'                  
————————————————————————————————————————————————————————————————————— 
                1/4          2/4          3/4          4/4        p  
              (N=1621)     (N=1620)     (N=1620)     (N=1621)  
————————————————————————————————————————————————————————————————————— 
 male                                                           0.000
   - 0       87 ( 5.4%)  581 (35.9%)  844 (52.1%)  1217 (75.1%)      
   - 1      1534 (94.6%) 1039 (64.1%) 776 (47.9%)  404 (24.9%)       
 youth                                                          0.000
   - 0      1198 (73.9%) 1203 (74.3%) 1305 (80.6%) 1396 (86.1%)      
   - 1      423 (26.1%)  417 (25.7%)  315 (19.4%)  225 (13.9%)       
 class5                                                         0.000
   - 1       1 ( 0.1%)    16 ( 1.0%)  190 (11.7%)  764 (47.1%)       
   - 2        0 ( 0.0%)    0 ( 0.0%)   8 ( 0.5%)   595 (36.7%)       
   - 3      827 (51.0%)  420 (25.9%)  190 (11.7%)   7 ( 0.4%)        
   - 4       92 ( 5.7%)  679 (41.9%)  608 (37.5%)  201 (12.4%)       
   - 5      701 (43.2%)  505 (31.2%)  624 (38.5%)   54 ( 3.3%)       
 large_firm                                                     0.000
   - 1      1054 (65.0%) 1238 (76.4%) 1407 (86.9%) 1475 (91.0%)      
   - 2      567 (35.0%)  382 (23.6%)  213 (13.1%)  146 ( 9.0%)       
 standard                                                       0.000
   - 0      183 (11.3%)  309 (19.1%)  523 (32.3%)  905 (55.8%)       
   - 1      1438 (88.7%) 1311 (80.9%) 1097 (67.7%) 716 (44.2%)       
 socins                                                         0.000
   - 0      113 ( 7.0%)  205 (12.7%)  371 (22.9%)  701 (43.2%)       
   - 1      1508 (93.0%) 1415 (87.3%) 1249 (77.1%) 920 (56.8%)       
————————————————————————————————————————————————————————————————————— 
mytable_sub(y.q ~ male + youth + class5 + large_firm + standard + socins, df23)

                   Descriptive Statistics by 'y.q'                  
————————————————————————————————————————————————————————————————————— 
                1/4          2/4          3/4          4/4        p  
              (N=1621)     (N=1620)     (N=1620)     (N=1621)  
————————————————————————————————————————————————————————————————————— 
 male                                                           0.000
   - 0      548 (33.8%)  480 (29.6%)  725 (44.8%)  976 (60.2%)       
   - 1      1073 (66.2%) 1140 (70.4%) 895 (55.2%)  645 (39.8%)       
 youth                                                          0.000
   - 0      714 (44.0%)  1493 (92.2%) 1544 (95.3%) 1351 (83.3%)      
   - 1      907 (56.0%)  127 ( 7.8%)   76 ( 4.7%)  270 (16.7%)       
 class5                                                         0.000
   - 1      121 ( 7.5%)  140 ( 8.6%)  212 (13.1%)  498 (30.7%)       
   - 2       45 ( 2.8%)   53 ( 3.3%)  147 ( 9.1%)  358 (22.1%)       
   - 3      483 (29.8%)  474 (29.3%)  337 (20.8%)  150 ( 9.3%)       
   - 4      314 (19.4%)  409 (25.2%)  494 (30.5%)  363 (22.4%)       
   - 5      658 (40.6%)  544 (33.6%)  430 (26.5%)  252 (15.5%)       
 large_firm                                                     0.000
   - 1      1106 (68.2%) 1188 (73.3%) 1386 (85.6%) 1494 (92.2%)      
   - 2      515 (31.8%)  432 (26.7%)  234 (14.4%)  127 ( 7.8%)       
 standard                                                       0.000
   - 0      104 ( 6.4%)  137 ( 8.5%)  291 (18.0%)  1388 (85.6%)      
   - 1      1517 (93.6%) 1483 (91.5%) 1329 (82.0%) 233 (14.4%)       
 socins                                                         0.000
   - 0       74 ( 4.6%)   92 ( 5.7%)  248 (15.3%)  976 (60.2%)       
   - 1      1547 (95.4%) 1528 (94.3%) 1372 (84.7%) 645 (39.8%)       
—————————————————————————————————————————————————————————————————————