@@ -958,3 +958,117 @@ make_pliv_multiway_cluster_CKMS2021 = function(N = 25, M = 25, dim_X = 100,
958958 }
959959 }
960960}
961+
#' Generates data from a sample selection model (SSM).
#'
#' The data generating process is defined as:
#'
#' \deqn{
#' y_i = \theta d_i + x_i' \beta + u_i,}
#'
#' \deqn{s_i = 1\lbrace d_i + \gamma z_i + x_i' \beta + v_i > 0 \rbrace,}
#'
#' \deqn{d_i = 1\lbrace x_i' \beta + w_i > 0 \rbrace,}
#'
#' with \eqn{y_i} being observed if \eqn{s_i = 1} and covariates
#' \eqn{x_i \sim \mathcal{N}(0, \Sigma^2_x)}, where
#' \eqn{\Sigma^2_x} is a matrix with entries
#' \eqn{\Sigma_{kj} = 0.5^{|j-k|}}.
#' \eqn{\beta} is a \code{dim_x}-vector with entries
#' \eqn{\beta_j=\frac{0.4}{j^2}},
#' \eqn{z_i \sim \mathcal{N}(0, 1)},
#' \eqn{(u_i,v_i) \sim \mathcal{N}(0, \Sigma^2_{u,v})},
#' \eqn{w_i \sim \mathcal{N}(0, 1)}.
#'
#' \eqn{\Sigma^2_{u,v}} has unit variances. If `mar = TRUE`, the covariance
#' of \eqn{u_i} and \eqn{v_i} is \eqn{0} and \eqn{\gamma = 0}; if
#' `mar = FALSE`, the covariance is \eqn{0.8} and \eqn{\gamma = 1}.
#' In the returned data, `y` is set to `0` for all observations with
#' \eqn{s_i = 0}.
#'
#' The data generating process is inspired by a process used in the
#' simulation study (see Appendix E) of Bia, Huber and Lafférs (2023).
#'
#' @param n_obs (`integer(1)`) \cr
#' The number of observations to simulate.
#' @param dim_x (`integer(1)`) \cr
#' The number of covariates. Must be at least `1`.
#' @param theta (`numeric(1)`) \cr
#' The value of the causal parameter.
#' @param mar (`logical(1)`) \cr
#' Indicates whether missingness at random holds. If `FALSE`, the
#' instrument `z` is part of the returned data.
#' @param return_type (`character(1)`) \cr
#' If `"DoubleMLData"`, returns a `DoubleMLData` object.
#' If `"data.frame"` returns a `data.frame()`.
#' If `"data.table"` returns a `data.table()`.
#' If `"matrix"` returns a named `list()` of matrices with elements
#' `X`, `y`, `d`, `s` (and `z` if `mar = FALSE`).
#' Default is `"DoubleMLData"`.
#'
#' @references Michela Bia, Martin Huber & Lukáš Lafférs (2023) Double
#' Machine Learning for Sample Selection Models,
#' Journal of Business & Economic Statistics,
#' DOI: 10.1080/07350015.2023.2271071
#'
#' @return Depending on the `return_type`, returns an object or set of
#' objects as specified.
#' @export
make_ssm_data = function(n_obs = 8000, dim_x = 100, theta = 1, mar = TRUE,
  return_type = "DoubleMLData") {

  assert_choice(
    return_type,
    c("data.table", "matrix", "data.frame", "DoubleMLData")
  )

  assert_count(n_obs)
  # dim_x = 0 would make the sequences below count backwards, so reject it
  # up front with a clear message.
  assert_count(dim_x, positive = TRUE)
  assert_numeric(theta, len = 1)
  stopifnot(is.logical(mar), length(mar) == 1L, !is.na(mar))

  # Under MAR the outcome and selection errors are independent and the
  # instrument does not enter the selection equation.
  if (mar) {
    sigma = matrix(c(1, 0, 0, 1), 2, 2)
    gamma = 0
  } else {
    sigma = matrix(c(1, 0.8, 0.8, 1), 2, 2)
    gamma = 1
  }

  # The order of the random draws (errors, covariates, treatment noise,
  # instrument) is kept fixed so that seeded runs stay reproducible.
  e = t(rmvnorm(n_obs, rep(0, 2), sigma))
  cov_mat = toeplitz(0.5^(seq_len(dim_x) - 1))
  x = rmvnorm(n_obs, rep(0, dim_x), cov_mat)
  beta = 0.4 / (seq_len(dim_x)^2)
  x_beta = x %*% beta

  d = ifelse(x_beta + rnorm(n_obs) > 0, 1, 0)
  z = as.matrix(rnorm(n_obs))
  s = ifelse(x_beta + d + gamma * z + e[1, ] > 0, 1, 0)
  y = x_beta + theta * d + e[2, ]
  # The outcome is only observed for selected units.
  y[s == 0] = 0

  colnames(x) = paste0("X", seq_len(dim_x))
  colnames(y) = "y"
  colnames(d) = "d"
  colnames(z) = "z"
  colnames(s) = "s"

  if (return_type == "matrix") {
    if (mar) {
      return(list("X" = x, "y" = y, "d" = d, "s" = s))
    }
    return(list("X" = x, "y" = y, "d" = d, "z" = z, "s" = s))
  }

  if (return_type == "data.frame") {
    if (mar) {
      return(data.frame(x, y, d, s))
    }
    return(data.frame(x, y, d, z, s))
  }

  # Both remaining return types are built on the same data.table.
  if (mar) {
    dt = data.table(x, y, d, s)
  } else {
    dt = data.table(x, y, d, z, s)
  }
  if (return_type == "data.table") {
    return(dt)
  }

  # return_type == "DoubleMLData"
  if (mar) {
    data = DoubleMLData$new(dt, y_col = "y", d_cols = "d", s_col = "s")
  } else {
    data = DoubleMLData$new(dt,
      y_col = "y", d_cols = "d", z_cols = "z", s_col = "s")
  }
  return(data)
}
0 commit comments