You need to enable JavaScript to run this app.
最新活动
大模型
产品
解决方案
定价
生态与合作
支持与服务
开发者
了解我们

如何结合mice与qgcomp包处理含缺失数据的混合分析?

问题描述
  • 需求:使用qgcomp开展混合分析,部分协变量存在缺失值,且部分缺失变量需用于生成新协变量(例如用缺失的身高、体重计算BMI)
  • 操作:通过mice包完成多重插补,参考《Imputation for limits of detection problems》文档结合qgcompmice,但在汇总权重步骤报错
  • 错误信息:invalid argument to unary operator,触发于代码行function(x) c(-x$neg.weights, x$pos.weights)[expnms]
复现代码
library("ggplot2")
library(mice)
library(qgcomp)
data("metals", package="qgcomp")
#Introducing missingness
metals[110, 15] <- NA
metals[13, 15] <- NA
metals[22, 15] <- NA
metals[4, 15] <- NA
metals[194, 15] <- NA
metals[67, 15] <- NA
metals[45, 15] <- NA
metals[122, 19] <- NA
metals[19, 19] <- NA
metals[83, 19] <- NA
metals[71, 19] <- NA
#performing multiple imputation
imput.metals <- mice(metals, m = 3)
#adding new variable
long1 <- complete(imput.metals, action='long', include=TRUE)
long1$ph2 <- long1$ph / (long1$total_hardness * long1$total_hardness / 50)
metals2 <- as.mids(long1)
#naming predictors and covariates
Xnm <- c('arsenic','barium','cadmium','calcium','chromium','copper',
  'iron','lead','magnesium','manganese','mercury','selenium','silver',
  'sodium','zinc'
)
covars = c('nitrate','nitrite','sulfate', 'total_alkalinity','ph2')
#doing the qgcomp analysis(here I get a warning "Warning messages: 1: glm.fit: fitted probabilities numerically 0 or 1 occurred" in the example dataset, but not with my own data)
qc.fitmetals.imp <- list(
  call = call("qgcomp.glm.boot(disease_state ~ arsenic + barium + cadmium + calcium + chromium + copper + iron + lead + magnesium + manganese +
              mercury + selenium + silver + sodium + zinc + nitrate + nitrite + sulfate + ph2 + total_alkalinity, expnms = Xnm, family=binomial()), q= 4, B=250"),
  call1 = metals2$call,
  nmis = metals2$nmis,
  analyses = lapply(1:3, function(x) qgcomp.glm.boot(disease_state ~ arsenic + barium + cadmium + calcium + chromium + copper + iron + lead + magnesium + manganese +
                                                       mercury + selenium + silver + sodium + zinc + nitrate + nitrite + sulfate + ph2 + total_alkalinity,
                                                     expnms = Xnm,
                                                     data=complete(metals2), family=binomial(), q= 4, B=250))
)
objmetals = pool(as.mira(qc.fitmetals.imp))

# MI based analysis 
summary(objmetals)
# summarizing weights(at this point I am getting the error messages) 
expnms = c('arsenic','barium','cadmium','calcium','chromium','copper',
           'iron','lead','magnesium','manganese','mercury','selenium','silver',
           'sodium','zinc')
wts = as.data.frame(t(sapply(qc.fitmetals.imp$analyses, 
                             function(x) c(-x$neg.weights, x$pos.weights)[expnms])))
eachwt = do.call(c, wts)
expwts = data.frame(Exposure = rep(expnms, each=nrow(wts)), Weight=eachwt)
ggplot(data=expwts)+ theme_classic() +
  geom_point(aes(x=Exposure, y=Weight)) +
  geom_hline(aes(yintercept=0))
问题排查与修正方案

错误根源

  1. 插补后数据集处理错误complete(imput.metals, action='long', include=TRUE)包含了原始未插补的带缺失数据,后续complete(metals2)默认调用第一个数据集(即原始带缺失数据),导致qgcomp模型拟合异常,返回结果中缺失neg.weightspos.weights元素,触发-x$neg.weights的运算符错误。
  2. 多重插补数据集调用错误lapply循环中未指定complete(metals2)action参数,每次都使用同一个数据集,未真正利用多重插补的多个结果。

修正后的完整代码

library(ggplot2)
library(mice)
library(qgcomp)
data("metals", package="qgcomp")

# 引入缺失值
metals[110, 15] <- NA
metals[13, 15] <- NA
metals[22, 15] <- NA
metals[4, 15] <- NA
metals[194, 15] <- NA
metals[67, 15] <- NA
metals[45, 15] <- NA
metals[122, 19] <- NA
metals[19, 19] <- NA
metals[83, 19] <- NA
metals[71, 19] <- NA

# 多重插补
imput.metals <- mice(metals, m = 3)

# 为每个插补数据集生成新变量(推荐方式)
imput.metals <- with(imput.metals, 
                     ph2 = ph / (total_hardness * total_hardness / 50))
metals2 <- imput.metals

# 定义暴露变量和协变量
Xnm <- c('arsenic','barium','cadmium','calcium','chromium','copper',
         'iron','lead','magnesium','manganese','mercury','selenium','silver',
         'sodium','zinc')
covars <- c('nitrate','nitrite','sulfate', 'total_alkalinity','ph2')

# 对每个插补数据集拟合qgcomp模型
qc.fitmetals.imp <- list(
  call = call("qgcomp.glm.boot", 
              formula = disease_state ~ arsenic + barium + cadmium + calcium + chromium + copper + iron + lead + magnesium + manganese +
                mercury + selenium + silver + sodium + zinc + nitrate + nitrite + sulfate + ph2 + total_alkalinity,
              expnms = Xnm, family=binomial(), q=4, B=250),
  call1 = metals2$call,
  nmis = metals2$nmis,
  analyses = lapply(1:metals2$m, function(x) {
    # 取出第x个插补后的数据集
    dat <- complete(metals2, action = x)
    qgcomp.glm.boot(disease_state ~ arsenic + barium + cadmium + calcium + chromium + copper + iron + lead + magnesium + manganese +
                      mercury + selenium + silver + sodium + zinc + nitrate + nitrite + sulfate + ph2 + total_alkalinity,
                    expnms = Xnm,
                    data = dat, family=binomial(), q=4, B=250)
  })
)

# 合并插补结果
objmetals <- pool(as.mira(qc.fitmetals.imp))
summary(objmetals)

# 汇总权重(增加异常检查)
expnms <- Xnm
wts <- as.data.frame(t(sapply(qc.fitmetals.imp$analyses, 
                             function(x) {
                               # 检查模型结果是否包含权重元素
                               if(!all(c("neg.weights", "pos.weights") %in% names(x))){
                                 stop("模型结果缺失权重元素,请检查模型拟合情况")
                               }
                               c(-x$neg.weights, x$pos.weights)[expnms]
                             })))
eachwt <- do.call(c, wts)
expwts <- data.frame(Exposure = rep(expnms, each=nrow(wts)), Weight=eachwt)

ggplot(data=expwts)+ theme_classic() +
  geom_point(aes(x=Exposure, y=Weight)) +
  geom_hline(aes(yintercept=0))

关键修正点

  1. 新变量生成方式:改用with(imput.metals, ...)直接在mids对象上计算新变量,确保每个插补数据集都有对应的ph2值,避免引入原始带缺失的数据。
  2. 正确调用插补数据集:在lapply循环中通过complete(metals2, action = x)指定第x个插补数据集,保证每个模型使用不同的插补结果。
  3. 增加权重检查:在提取权重时添加检查逻辑,提前发现模型拟合异常导致的权重缺失问题。

关于拟合警告说明

示例数据中出现的glm.fit: fitted probabilities numerically 0 or 1 occurred警告,是因为自变量与因变量关联过强导致模型完全分离,自有数据无此警告说明不存在该问题,无需额外处理。

内容的提问来源于stack exchange,提问作者JDion

火山引擎 最新活动