最近正在进行一项UK Biobank的数据分析研究,整理了一下我可能用到的数据清洗代码,在此存档,以便后续类似的分析工作时可以快速便捷调用。
我的命名习惯
名称 | 属性 |
---|---|
data_participant | 从UK Biobank下载的原始数据 |
smoking | 吸烟数据(若原始数据中没有则补充下载) |
inpatient | 住院数据(包括ICD10诊断和时间)(若原始数据中没有则补充下载) |
merged_data | data_participant清洗之后输出(如有必要)的数据集 |
merge_combine_data | 根据Participant.ID合并后的数据集 |
代码汇总
library(dplyr)
library(tidyr)
library(readxl)
# 读取相关文件----
data_participant <- read.csv("data_participant.csv")
inpatient <- read.csv("inpatient.csv")
smoking <- read.csv("smoking.csv")
# 对data_participant进行数据清洗----
## 合并Instances----
### Baso_Count
# 1. 选择所有Baso_Count相关的列
baso_count_columns <- names(data_participant)[grepl("Basophill.count", names(data_participant))]
# 2. 计算每个Participant.ID的baso_count值
# 如果只有一个值,则直接取该值;如果有多个值,则计算平均值
baso_count_summary <- data_participant %>%
# 将baso列转换为长格式
pivot_longer(cols = all_of(baso_count_columns), names_to = "Measurement", values_to = "Baso_Count") %>%
# 去除缺失值
filter(!is.na(Baso_Count)) %>%
# 按Participant.ID分组,计算每个组的平均值
group_by(Participant.ID) %>%
summarise(Baso_Count = mean(Baso_Count, na.rm = TRUE))
# 3. 将计算结果合并回原数据集
# 使用left_join()将Baso_Count列添加到data_participant中
data_participant <- left_join(data_participant, baso_count_summary, by = "Participant.ID")
### Baso_Per
# 1. 选择所有Baso_Per相关的列
baso_per_columns <- names(data_participant)[grepl("Basophill.percentage", names(data_participant))]
# 2. 计算每个Participant.ID的baso_per值
# 如果只有一个值,则直接取该值;如果有多个值,则计算平均值
baso_per_summary <- data_participant %>%
# 将baso列转换为长格式
pivot_longer(cols = all_of(baso_per_columns), names_to = "Measurement", values_to = "Baso_Per") %>%
# 去除缺失值
filter(!is.na(Baso_Per)) %>%
# 按Participant.ID分组,计算每个组的平均值
group_by(Participant.ID) %>%
summarise(Baso_Per = mean(Baso_Per, na.rm = TRUE))
# 3. 将计算结果合并回原数据集
# 使用left_join()将Baso_Count列添加到data_participant中
data_participant <- left_join(data_participant, baso_per_summary, by = "Participant.ID")
### EOS_Count
# 1. 选择所有EOS_Count相关的列
eos_count_columns <- names(data_participant)[grepl("Eosinophill.count", names(data_participant))]
# 2. 计算每个Participant.ID的eos_count值
# 如果只有一个值,则直接取该值;如果有多个值,则计算平均值
eos_count_summary <- data_participant %>%
# 将eos列转换为长格式
pivot_longer(cols = all_of(eos_count_columns), names_to = "Measurement", values_to = "Eos_Count") %>%
# 去除缺失值
filter(!is.na(Eos_Count)) %>%
# 按Participant.ID分组,计算每个组的平均值
group_by(Participant.ID) %>%
summarise(Eos_Count = mean(Eos_Count, na.rm = TRUE))
# 3. 将计算结果合并回原数据集
# 使用left_join()将Eos_Count列添加到data_participant中
data_participant <- left_join(data_participant, eos_count_summary, by = "Participant.ID")
### EOS_Per
# 1. 选择所有EOS_Per相关的列
eos_per_columns <- names(data_participant)[grepl("Eosinophill.percentage", names(data_participant))]
# 2. 计算每个Participant.ID的eos_per值
# 如果只有一个值,则直接取该值;如果有多个值,则计算平均值
eos_per_summary <- data_participant %>%
# 将eos列转换为长格式
pivot_longer(cols = all_of(eos_per_columns), names_to = "Measurement", values_to = "Eos_Per") %>%
# 去除缺失值
filter(!is.na(Eos_Per)) %>%
# 按Participant.ID分组,计算每个组的平均值
group_by(Participant.ID) %>%
summarise(Eos_Per = mean(Eos_Per, na.rm = TRUE))
# 3. 将计算结果合并回原数据集
# 使用left_join()将Eos_Per列添加到data_participant中
data_participant <- left_join(data_participant, eos_per_summary, by = "Participant.ID")
### HCT
# 1. 选择所有HCT相关的列
HCT_columns <- names(data_participant)[grepl("Haematocrit.percentage", names(data_participant))]
# 2. 计算每个Participant.ID的值
# 如果只有一个值,则直接取该值;如果有多个值,则计算平均值
HCT_summary <- data_participant %>%
# 将列转换为长格式
pivot_longer(cols = all_of(HCT_columns), names_to = "Measurement", values_to = "HCT") %>%
# 去除缺失值
filter(!is.na(HCT)) %>%
# 按Participant.ID分组,计算每个组的平均值
group_by(Participant.ID) %>%
summarise(HCT = mean(HCT, na.rm = TRUE))
# 3. 将计算结果合并回原数据集
# 使用left_join()将列添加到data_participant中
data_participant <- left_join(data_participant, HCT_summary, by = "Participant.ID")
### HGB
# 1. 选择所有HGB相关的列
HGB_columns <- names(data_participant)[grepl("Haemoglobin.concentration", names(data_participant))]
# 2. 计算每个Participant.ID的HGB值
# 如果只有一个值,则直接取该值;如果有多个值,则计算平均值
HGB_summary <- data_participant %>%
# 将HGB列转换为长格式
pivot_longer(cols = all_of(HGB_columns), names_to = "Measurement", values_to = "HGB") %>%
# 去除缺失值
filter(!is.na(HGB)) %>%
# 按Participant.ID分组,计算每个组的平均值
group_by(Participant.ID) %>%
summarise(HGB = mean(HGB, na.rm = TRUE))
# 3. 将计算结果合并回原数据集
# 使用left_join()将HGB列添加到data_participant中
data_participant <- left_join(data_participant, HGB_summary, by = "Participant.ID")
### CRP
# 1. 选择所有CRP相关的列
CRP_columns <- names(data_participant)[grepl("C.reactive.protein", names(data_participant))]
# 2. 计算每个Participant.ID的值
# 如果只有一个值,则直接取该值;如果有多个值,则计算平均值
CRP_summary <- data_participant %>%
# 将列转换为长格式
pivot_longer(cols = all_of(CRP_columns), names_to = "Measurement", values_to = "CRP") %>%
# 去除缺失值
filter(!is.na(CRP)) %>%
# 按Participant.ID分组,计算每个组的平均值
group_by(Participant.ID) %>%
summarise(CRP = mean(CRP, na.rm = TRUE))
# 3. 将计算结果合并回原数据集
# 使用left_join()将列添加到data_participant中
data_participant <- left_join(data_participant, CRP_summary, by = "Participant.ID")
### Lym_Count
# 1. 选择所有相关的列
Lym_Count_columns <- names(data_participant)[grepl("Lymphocyte.count", names(data_participant))]
# 2. 计算每个Participant.ID的值
# 如果只有一个值,则直接取该值;如果有多个值,则计算平均值
Lym_Count_summary <- data_participant %>%
# 将列转换为长格式
pivot_longer(cols = all_of(Lym_Count_columns), names_to = "Measurement", values_to = "Lym_Count") %>%
# 去除缺失值
filter(!is.na(Lym_Count)) %>%
# 按Participant.ID分组,计算每个组的平均值
group_by(Participant.ID) %>%
summarise(Lym_Count = mean(Lym_Count, na.rm = TRUE))
# 3. 将计算结果合并回原数据集
# 使用left_join()将列添加到data_participant中
data_participant <- left_join(data_participant, Lym_Count_summary, by = "Participant.ID")
### Lym_Per
# 1. 选择所有相关的列
Lym_Per_columns <- names(data_participant)[grepl("Lymphocyte.percentage", names(data_participant))]
# 2. 计算每个Participant.ID的值
# 如果只有一个值,则直接取该值;如果有多个值,则计算平均值
Lym_Per_summary <- data_participant %>%
# 将列转换为长格式
pivot_longer(cols = all_of(Lym_Per_columns), names_to = "Measurement", values_to = "Lym_Per") %>%
# 去除缺失值
filter(!is.na(Lym_Per)) %>%
# 按Participant.ID分组,计算每个组的平均值
group_by(Participant.ID) %>%
summarise(Lym_Per = mean(Lym_Per, na.rm = TRUE))
# 3. 将计算结果合并回原数据集
# 使用left_join()将列添加到data_participant中
data_participant <- left_join(data_participant, Lym_Per_summary, by = "Participant.ID")
### Neu_Count
# 1. 选择所有相关的列
Neu_Count_columns <- names(data_participant)[grepl("Neutrophill.count", names(data_participant))]
# 2. 计算每个Participant.ID的值
# 如果只有一个值,则直接取该值;如果有多个值,则计算平均值
Neu_Count_summary <- data_participant %>%
# 将列转换为长格式
pivot_longer(cols = all_of(Neu_Count_columns), names_to = "Measurement", values_to = "Neu_Count") %>%
# 去除缺失值
filter(!is.na(Neu_Count)) %>%
# 按Participant.ID分组,计算每个组的平均值
group_by(Participant.ID) %>%
summarise(Neu_Count = mean(Neu_Count, na.rm = TRUE))
# 3. 将计算结果合并回原数据集
# 使用left_join()将列添加到data_participant中
data_participant <- left_join(data_participant, Neu_Count_summary, by = "Participant.ID")
### Neu_Per
# 1. 选择所有相关的列
Neu_Per_columns <- names(data_participant)[grepl("Neutrophill.percentage", names(data_participant))]
# 2. 计算每个Participant.ID的值
# 如果只有一个值,则直接取该值;如果有多个值,则计算平均值
Neu_Per_summary <- data_participant %>%
# 将列转换为长格式
pivot_longer(cols = all_of(Neu_Per_columns), names_to = "Measurement", values_to = "Neu_Per") %>%
# 去除缺失值
filter(!is.na(Neu_Per)) %>%
# 按Participant.ID分组,计算每个组的平均值
group_by(Participant.ID) %>%
summarise(Neu_Per = mean(Neu_Per, na.rm = TRUE))
# 3. 将计算结果合并回原数据集
# 使用left_join()将列添加到data_participant中
data_participant <- left_join(data_participant, Neu_Per_summary, by = "Participant.ID")
### WBC
# 1. 选择所有相关的列
WBC_columns <- names(data_participant)[grepl("White.blood.cell..leukocyte..count", names(data_participant))]
# 2. 计算每个Participant.ID的值
# 如果只有一个值,则直接取该值;如果有多个值,则计算平均值
WBC_summary <- data_participant %>%
# 将列转换为长格式
pivot_longer(cols = all_of(WBC_columns), names_to = "Measurement", values_to = "WBC") %>%
# 去除缺失值
filter(!is.na(WBC)) %>%
# 按Participant.ID分组,计算每个组的平均值
group_by(Participant.ID) %>%
summarise(WBC = mean(WBC, na.rm = TRUE))
# 3. 将计算结果合并回原数据集
# 使用left_join()将列添加到data_participant中
data_participant <- left_join(data_participant, WBC_summary, by = "Participant.ID")
### PLT
# 1. 选择所有相关的列
PLT_columns <- names(data_participant)[grepl("Platelet.count", names(data_participant))]
# 2. 计算每个Participant.ID的值
# 如果只有一个值,则直接取该值;如果有多个值,则计算平均值
PLT_summary <- data_participant %>%
# 将列转换为长格式
pivot_longer(cols = all_of(PLT_columns), names_to = "Measurement", values_to = "PLT") %>%
# 去除缺失值
filter(!is.na(PLT)) %>%
# 按Participant.ID分组,计算每个组的平均值
group_by(Participant.ID) %>%
summarise(PLT = mean(PLT, na.rm = TRUE))
# 3. 将计算结果合并回原数据集
# 使用left_join()将列添加到data_participant中
data_participant <- left_join(data_participant, PLT_summary, by = "Participant.ID")
### Mono_Count
# 1. 选择所有相关的列
Mono_Count_columns <- names(data_participant)[grepl("Monocyte.count", names(data_participant))]
# 2. 计算每个Participant.ID的值
# 如果只有一个值,则直接取该值;如果有多个值,则计算平均值
Mono_Count_summary <- data_participant %>%
# 将列转换为长格式
pivot_longer(cols = all_of(Mono_Count_columns), names_to = "Measurement", values_to = "Mono_Count") %>%
# 去除缺失值
filter(!is.na(Mono_Count)) %>%
# 按Participant.ID分组,计算每个组的平均值
group_by(Participant.ID) %>%
summarise(Mono_Count = mean(Mono_Count, na.rm = TRUE))
# 3. 将计算结果合并回原数据集
# 使用left_join()将列添加到data_participant中
data_participant <- left_join(data_participant, Mono_Count_summary, by = "Participant.ID")
### Mono_Per
# 1. 选择所有相关的列
Mono_Per_columns <- names(data_participant)[grepl("Monocyte.percentage", names(data_participant))]
# 2. 计算每个Participant.ID的值
# 如果只有一个值,则直接取该值;如果有多个值,则计算平均值
Mono_Per_summary <- data_participant %>%
# 将列转换为长格式
pivot_longer(cols = all_of(Mono_Per_columns), names_to = "Measurement", values_to = "Mono_Per") %>%
# 去除缺失值
filter(!is.na(Mono_Per)) %>%
# 按Participant.ID分组,计算每个组的平均值
group_by(Participant.ID) %>%
summarise(Mono_Per = mean(Mono_Per, na.rm = TRUE))
# 3. 将计算结果合并回原数据集
# 使用left_join()将列添加到data_participant中
data_participant <- left_join(data_participant, Mono_Per_summary, by = "Participant.ID")
### ALB
# 1. 选择所有相关的列
ALB_columns <- names(data_participant)[grepl("Albumin", names(data_participant))]
# 2. 计算每个Participant.ID的值
# 如果只有一个值,则直接取该值;如果有多个值,则计算平均值
ALB_summary <- data_participant %>%
# 将列转换为长格式
pivot_longer(cols = all_of(ALB_columns), names_to = "Measurement", values_to = "ALB") %>%
# 去除缺失值
filter(!is.na(ALB)) %>%
# 按Participant.ID分组,计算每个组的平均值
group_by(Participant.ID) %>%
summarise(ALB = mean(ALB, na.rm = TRUE))
# 3. 将计算结果合并回原数据集
# 使用left_join()将列添加到data_participant中
data_participant <- left_join(data_participant, ALB_summary, by = "Participant.ID")
##删除NA值----
data_participant <- data_participant %>%
filter(!is.na(Eos_Count))
data_participant <- data_participant %>%
filter(!is.na(Eos_Per))
data_participant <- data_participant %>%
filter(!is.na(HCT))
data_participant <- data_participant %>%
filter(!is.na(Lym_Count))
data_participant <- data_participant %>%
filter(!is.na(Lym_Per))
data_participant <- data_participant %>%
filter(!is.na(Neu_Count))
data_participant <- data_participant %>%
filter(!is.na(Neu_Per))
data_participant <- data_participant %>%
filter(!is.na(WBC))
data_participant <- data_participant %>%
filter(!is.na(Baso_Per))
data_participant <- data_participant %>%
filter(!is.na(Baso_Count))
data_participant <- data_participant %>%
filter(!is.na(HGB))
data_participant <- data_participant %>%
filter(!is.na(CRP))
data_participant <- data_participant %>%
filter(!is.na(Neu_Count))
data_participant <- data_participant %>%
filter(!is.na(Neu_Per))
data_participant <- data_participant %>%
filter(!is.na(PLT))
data_participant <- data_participant %>%
filter(!is.na(Mono_Count))
data_participant <- data_participant %>%
filter(!is.na(Mono_Per))
data_participant <- data_participant %>%
filter(!is.na(ALB))
## 输出----
write.csv(data_participant, "merged_data.csv")
## 读取----
merged_data <- read.csv("merged_data.csv")
# 对inpatient进行数据清洗----
df <- inpatient
## 拆分诊断列
# 目标疾病代码
target_disease <- c("J96.0", "J96.1", "96.9") #J96:呼吸衰竭
# 将目标疾病代码合并成一个正则表达式
pattern <- paste(target_disease, collapse = "|")
# 使用正则表达式筛选包含目标疾病代码的行
df_filtered <- df %>%
filter(grepl(pattern, p41270))
# 将 p41270 列拆分为多行并生成索引
library(tidyr)
df_long <- df_filtered %>%
separate_rows(p41270, sep = "\\|") %>%
group_by(Participant.ID) %>%
mutate(disease_index = row_number()) %>%
ungroup()
# 获取诊断时间列的名称
time_columns <- grep("^X41280\\.0\\.", names(df), value = TRUE)
# 定义一个函数,用于处理每一批次数据
process_batch <- function(batch_data, original_df, time_columns) {
batch_data <- batch_data %>%
rowwise() %>%
mutate(
# 根据当前行确定诊断时间列的索引
diagnosis_time_col = time_columns[disease_index],
# 使用 match 和列名从原始数据框中提取对应的诊断时间
first_diagnosis_time = ifelse(!is.na(diagnosis_time_col),
original_df[[diagnosis_time_col]][match(Participant.ID, original_df$Participant.ID)],
NA)
) %>%
ungroup()
return(batch_data)
}
# 数据分批处理
batch_size <- 1000 # 每批次处理1000行,可以根据你的计算机性能进行调整
num_batches <- ceiling(nrow(df_long) / batch_size)
# 初始化存储最终结果的数据框
final_df_long <- data.frame()
# 按批次处理数据
for (i in 1:num_batches) {
start_row <- ((i - 1) * batch_size) + 1
end_row <- min(i * batch_size, nrow(df_long))
# 获取当前批次的数据
batch_data <- df_long[start_row:end_row, ]
# 处理当前批次数据
processed_batch <- process_batch(batch_data, df, time_columns)
# 将处理好的批次数据追加到最终数据框
final_df_long <- bind_rows(final_df_long, processed_batch)
# 打印进度信息
print(paste("Processed batch", i, "of", num_batches))
}
# 检查最终结果
head(final_df_long)
# 输出
write.csv(final_df_long, "diagnosis_separated.csv")
# 读取
diagnosis <- read.csv("diagnosis_separated.csv")
# 对smoking进行数据清洗----
# 整理Instance 0至Instance 3的列----
## Current tobacco smoking
smoking <- smoking %>%
mutate(current.tobacco.smoking = case_when(
# 四列中有一列为 "No"
`Current.tobacco.smoking...Instance.0` == "No" |
`Current.tobacco.smoking...Instance.1` == "No" |
`Current.tobacco.smoking...Instance.2` == "No" |
`Current.tobacco.smoking...Instance.3` == "No" ~ "No",
# 四列中有一列为 "Yes, on most or all days"
`Current.tobacco.smoking...Instance.0` == "Yes, on most or all days" |
`Current.tobacco.smoking...Instance.1` == "Yes, on most or all days" |
`Current.tobacco.smoking...Instance.2` == "Yes, on most or all days" |
`Current.tobacco.smoking...Instance.3` == "Yes, on most or all days" ~ "Yes, on most or all days",
# 四列中有一列为 "Only occasionally"
`Current.tobacco.smoking...Instance.0` == "Only occasionally" |
`Current.tobacco.smoking...Instance.1` == "Only occasionally" |
`Current.tobacco.smoking...Instance.2` == "Only occasionally" |
`Current.tobacco.smoking...Instance.3` == "Only occasionally" ~ "Only occasionally",
# 四列中有一列为 "Prefer not to answer"
`Current.tobacco.smoking...Instance.0` == "Prefer not to answer" |
`Current.tobacco.smoking...Instance.1` == "Prefer not to answer" |
`Current.tobacco.smoking...Instance.2` == "Prefer not to answer" |
`Current.tobacco.smoking...Instance.3` == "Prefer not to answer" ~ "Prefer not to answer",
# 默认值
TRUE ~ "Unknown" # 如果没有符合的情况,设为 "Unknown"
))
## Past tobacco smoking
smoking <- smoking %>%
mutate(past.tobacco.smoking = case_when(
# 四列中有一列为 "I have never smoked"
`Past.tobacco.smoking...Instance.0` == "I have never smoked" |
`Past.tobacco.smoking...Instance.1` == "I have never smoked" |
`Past.tobacco.smoking...Instance.2` == "I have never smoked" |
`Past.tobacco.smoking...Instance.3` == "I have never smoked" ~ "I have never smoked",
# 四列中有一列为 Just tried once or twice"
`Past.tobacco.smoking...Instance.0` == "Just tried once or twice" |
`Past.tobacco.smoking...Instance.1` == "Just tried once or twice" |
`Past.tobacco.smoking...Instance.2` == "Just tried once or twice" |
`Past.tobacco.smoking...Instance.3` == "Just tried once or twice" ~ "Just tried once or twice",
# 四列中有一列为 "Smoked occasionally"
`Past.tobacco.smoking...Instance.0` == "Smoked occasionally" |
`Past.tobacco.smoking...Instance.1` == "Smoked occasionally" |
`Past.tobacco.smoking...Instance.2` == "Smoked occasionally" |
`Past.tobacco.smoking...Instance.3` == "Smoked occasionally" ~ "Smoked occasionally",
# 四列中有一列为 "Smoked on most or all days"
`Past.tobacco.smoking...Instance.0` == "Smoked on most or all days" |
`Past.tobacco.smoking...Instance.1` == "Smoked on most or all days" |
`Past.tobacco.smoking...Instance.2` == "Smoked on most or all days" |
`Past.tobacco.smoking...Instance.3` == "Smoked on most or all days" ~ "Smoked on most or all days",
# 四列中有一列为 "Prefer not to answer"
`Past.tobacco.smoking...Instance.0` == "Prefer not to answer" |
`Past.tobacco.smoking...Instance.1` == "Prefer not to answer" |
`Past.tobacco.smoking...Instance.2` == "Prefer not to answer" |
`Past.tobacco.smoking...Instance.3` == "Prefer not to answer" ~ "Prefer not to answer",
# 默认值
TRUE ~ "Unknown" # 如果没有符合的情况,设为 "Unknown"
))
## Ever smoked
smoking <- smoking %>%
mutate(ever.smoked = case_when(
# 四列中有一列为 "No"
`Ever.smoked...Instance.0` == "No" |
`Ever.smoked...Instance.1` == "No" |
`Ever.smoked...Instance.2` == "No" |
`Ever.smoked...Instance.3` == "No" ~ "No",
# 四列中有一列为 "Yes"
`Ever.smoked...Instance.0` == "Yes" |
`Ever.smoked...Instance.1` == "Yes" |
`Ever.smoked...Instance.2` == "Yes" |
`Ever.smoked...Instance.3` == "Yes" ~ "Yes",
# 默认值
TRUE ~ "Unknown" # 如果没有符合的情况,设为 "Unknown"
))
## Smoking.status
smoking <- smoking %>%
mutate(smoking.status = case_when(
# 四列中有一列为 "Current"
`Smoking.status...Instance.0` == "Current" |
`Smoking.status...Instance.1` == "Current" |
`Smoking.status...Instance.2` == "Current" |
`Smoking.status...Instance.3` == "Current" ~ "Current",
# 四列中有一列为 "Previous"
`Smoking.status...Instance.0` == "Previous" |
`Smoking.status...Instance.1` == "Previous" |
`Smoking.status...Instance.2` == "Previous" |
`Smoking.status...Instance.3` == "Previous" ~ "Previous",
# 四列中有一列为 "Never"
`Smoking.status...Instance.0` == "Never" |
`Smoking.status...Instance.1` == "Never" |
`Smoking.status...Instance.2` == "Never" |
`Smoking.status...Instance.3` == "Never" ~ "Never",
# 四列中有一列为 "Prefer not to answer"
`Smoking.status...Instance.0` == "Prefer not to answer" |
`Smoking.status...Instance.1` == "Prefer not to answer" |
`Smoking.status...Instance.2` == "Prefer not to answer" |
`Smoking.status...Instance.3` == "Prefer not to answer" ~ "Prefer not to answer",
# 默认值
TRUE ~ "Unknown" # 如果没有符合的情况,设为 "Unknown"
))
## Light smokers, at least 100 smokes in lifetime
smoking <- smoking %>%
mutate(Light.smokers..at.least.100.smokes.in.lifetime = case_when(
# 四列中有一列为 "Yes"
`Light.smokers..at.least.100.smokes.in.lifetime...Instance.0` == "Yes" |
`Light.smokers..at.least.100.smokes.in.lifetime...Instance.1` == "Yes" |
`Light.smokers..at.least.100.smokes.in.lifetime...Instance.2` == "Yes" |
`Light.smokers..at.least.100.smokes.in.lifetime...Instance.3` == "Yes" ~ "Yes",
# 四列中有一列为 "No"
`Light.smokers..at.least.100.smokes.in.lifetime...Instance.0` == "No" |
`Light.smokers..at.least.100.smokes.in.lifetime...Instance.1` == "No" |
`Light.smokers..at.least.100.smokes.in.lifetime...Instance.2` == "No" |
`Light.smokers..at.least.100.smokes.in.lifetime...Instance.3` == "No" ~ "No",
# 四列中有一列为 "Do not know"
`Light.smokers..at.least.100.smokes.in.lifetime...Instance.0` == "Do not know" |
`Light.smokers..at.least.100.smokes.in.lifetime...Instance.1` == "Do not know" |
`Light.smokers..at.least.100.smokes.in.lifetime...Instance.2` == "Do not know" |
`Light.smokers..at.least.100.smokes.in.lifetime...Instance.3` == "Do not know" ~ "Do not know",
# 四列中有一列为 "Prefer not to answer"
`Light.smokers..at.least.100.smokes.in.lifetime...Instance.0` == "Prefer not to answer" |
`Light.smokers..at.least.100.smokes.in.lifetime...Instance.1` == "Prefer not to answer" |
`Light.smokers..at.least.100.smokes.in.lifetime...Instance.2` == "Prefer not to answer" |
`Light.smokers..at.least.100.smokes.in.lifetime...Instance.3` == "Prefer not to answer" ~ "Prefer not to answer",
# 默认值
TRUE ~ "Unknown" # 如果没有符合的情况,设为 "Unknown"
))
## Pack years of smoking
smoking <- smoking %>%
mutate(Pack_years_smoking = coalesce(Pack.years.of.smoking...Instance.0,
Pack.years.of.smoking...Instance.1,
Pack.years.of.smoking...Instance.2,
Pack.years.of.smoking...Instance.3))
# 整理变量“是否吸烟”[剔除Light smokers作为衡量标准]----
smoking <- smoking %>%
mutate(smoking = case_when(
# Yes
`current.tobacco.smoking` == "Yes, on most or all days" |
`current.tobacco.smoking` == "Only occasionally" |
`past.tobacco.smoking` == "Just tried once or twice" |
`past.tobacco.smoking` == "Smoked occasionally" |
`past.tobacco.smoking` == "Smoked on most or all days" |
`smoking.status` == "Yes" |
`ever.smoked` == "Yes" ~ "Yes",
# No
`current.tobacco.smoking` == "No" &
`past.tobacco.smoking` == "I have never smoked" &
`smoking.status` == "Never" &
`ever.smoked` == "No" ~ "No",
TRUE ~ "Unknown" # 如果没有符合的情况,设为 "Unknown"
))
write.csv(smoking, "smoking_clean.csv")
# 手动调整smoking列为Unknown的样本----
## 0.所有吸烟指标均为Unknown 或 Prefer not to answer(1127)
## a. 吸烟指数有明确数值 -> Yes(247)
## b. Current & Past & Ever smoked = No, Smoking status = Current/Previous -> 删除(873)
## d. Current = No, 其余均为Unknown或Prefer not to answer -> No(1540)
## b. Past & Smoking status = No, 其余均为Unknown或Prefer not to answer -> No(36)
## e. Smoking status = Never -> No(36)
# 手动调整smoking列为No的样本----
## a. Current = No, Light smokers = Yes -> 删除(356)
# 手动调整smoking列为Yes的样本----
## a. Current & Past = No, Smoking status = Current/Previous -> 删除(249)
## b. Current = No, Smoking status = Current -> 删除(3781)
## c. Current = No, Light smokers = Yes -> 删除(55197)
# !手动删除Pack_years_smoking 为NA的样本[不能删,因为不吸烟的都是NA!]!----
# 手动删除Smoking = Yes,Pack_years_smoking为NA或0的样本(87970)----
# 手动将smoking = No的吸烟指数调整为0
# 明确COPD的诊断----
diagnosis_separated <- diagnosis_separated %>%
mutate(
COPD = ifelse(grepl("^J44", diagnosis), "Yes", "No")
)
diagnosis_separated <- diagnosis_separated %>%
group_by(Participant.ID) %>%
mutate(
COPD_time = ifelse(
any(COPD == "Yes"),
min(first_diagnosis_time[COPD == "Yes"], na.rm = TRUE),
NA
)
) %>%
ungroup()
# 明确呼衰的诊断----
# 步骤1:创建RF列(仅匹配J96.0/J96.1/J96.9)
diagnosis_separated <- diagnosis_separated %>%
mutate(
RF = ifelse(
grepl("^J96\\.0|^J96\\.1|^J96\\.9", diagnosis),
"Yes",
"No"
)
)
# 步骤2:创建RF_time列(仅针对J96.0/J96.1/J96.9的首次诊断时间)
diagnosis_separated <- diagnosis_separated %>%
group_by(Participant.ID) %>%
mutate(
RF_time = ifelse(
any(RF == "Yes"),
min(first_diagnosis_time[RF == "Yes"], na.rm = TRUE),
NA
)
) %>%
ungroup()
# 步骤3:筛选数据
# 标记需排除的呼吸系统疾病(除COPD外)
diagnosis_separated <- diagnosis_separated %>%
mutate(
# 标记哮喘(J45)、间质性肺病(J84)、肺癌(C34)、肺炎(J12-J18)等
exclude_resp = ifelse(
grepl("^J45|^J84|^C34|^J1[2-8]", diagnosis),
"Yes",
"No"
)
)
# 步骤2:标记终末期疾病(终末期肾病N18.5、肿瘤C00-C97)
diagnosis_separated <- diagnosis_separated %>%
mutate(
# 终末期肾病:N18.5;肿瘤:C00-C97
exclude_terminal = ifelse(
grepl("^N18\\.5|^C", diagnosis),
"Yes",
"No"
)
)
# 步骤3:标记预期寿命不足3年者(终末期肺纤维化J84.1、胰腺癌C25、ALS G12.2等)
diagnosis_separated <- diagnosis_separated %>%
mutate(
exclude_lifespan = ifelse(
grepl("^J84\\.1|^C25|^G12\\.2", diagnosis),
"Yes",
"No"
)
)
# 步骤4:综合筛选(保留不符合任何排除条件的患者)
filtered_data <- diagnosis_separated %>%
filter(
exclude_resp == "No" & # 排除COPD外的呼吸系统疾病[1,3](@ref)
exclude_terminal == "No" & # 排除终末期肾病/肿瘤[6,7](@ref)
exclude_lifespan == "No" # 排除预期寿命<3年者[8,9](@ref)
) %>%
select(-starts_with("exclude")) # 删除临时标记列
diagnosis_separated <- diagnosis_separated %>%
# 删除COPD和RF均为No的患者
filter(!(COPD == "No" & RF == "No")) %>%
# 将时间缺失值设为NA,避免Inf干扰比较
mutate(
COPD_time = ifelse(COPD == "Yes", COPD_time, NA),
RF_time = ifelse(RF == "Yes", RF_time, NA)
) %>%
# 删除RF发生在COPD前的患者(仅当两者均为Yes时比较)
filter(
!(COPD == "Yes" & RF == "Yes" & RF_time < COPD_time)
)
# 输出
write.csv(diagnosis_separated, "real_outcome.csv")
# 数据整合
# 读取三个CSV文件
merged_data <- read.csv("merged_data.csv")
smoking_clean <- read.csv("smoking_clean.csv")
real_outcome <- read.csv("real_outcome.csv")
# 全外连接合并(保留所有行)
merge_combine_data <- merged_data %>%
full_join(smoking_clean, by = "Participant.ID") %>%
full_join(real_outcome, by = "Participant.ID")
评论