UK Biobank数据清洗代码

数据库挖掘, 科研相关  ·  2025-03-23

最近正在进行一项UK Biobank的数据分析研究,整理了一下我可能用到的数据清洗代码,在此存档,以便后续类似的分析工作时可以快速便捷调用。

我的命名习惯

名称属性
data_participant从UK Biobank下载的原始数据
smoking吸烟数据(若原始数据中没有则补充下载)
inpatient住院数据(包括ICD10诊断和时间)(若原始数据中没有则补充下载)
merged_datadata_participant清洗之后输出(如有必要)的数据集
merge_combine_data根据Participant.ID合并后的数据集

代码汇总

library(dplyr)
library(tidyr)
library(readxl)

# 读取相关文件----
data_participant <- read.csv("data_participant.csv")
inpatient <- read.csv("inpatient.csv")
smoking <- read.csv("smoking.csv")

# 对data_participant进行数据清洗----
## 合并Instances----
### Baso_Count
# 1. 选择所有Baso_Count相关的列
baso_count_columns <- names(data_participant)[grepl("Basophill.count", names(data_participant))]

# 2. 计算每个Participant.ID的baso_count值
# 如果只有一个值,则直接取该值;如果有多个值,则计算平均值
baso_count_summary <- data_participant %>%
  # 将baso列转换为长格式
  pivot_longer(cols = all_of(baso_count_columns), names_to = "Measurement", values_to = "Baso_Count") %>%
  # 去除缺失值
  filter(!is.na(Baso_Count)) %>%
  # 按Participant.ID分组,计算每个组的平均值
  group_by(Participant.ID) %>%
  summarise(Baso_Count = mean(Baso_Count, na.rm = TRUE))

# 3. 将计算结果合并回原数据集
# 使用left_join()将Baso_Count列添加到data_participant中
data_participant <- left_join(data_participant, baso_count_summary, by = "Participant.ID")

### Baso_Per

# 1. 选择所有Baso_Per相关的列
baso_per_columns <- names(data_participant)[grepl("Basophill.percentage", names(data_participant))]

# 2. 计算每个Participant.ID的baso_per值
# 如果只有一个值,则直接取该值;如果有多个值,则计算平均值
baso_per_summary <- data_participant %>%
  # 将baso列转换为长格式
  pivot_longer(cols = all_of(baso_per_columns), names_to = "Measurement", values_to = "Baso_Per") %>%
  # 去除缺失值
  filter(!is.na(Baso_Per)) %>%
  # 按Participant.ID分组,计算每个组的平均值
  group_by(Participant.ID) %>%
  summarise(Baso_Per = mean(Baso_Per, na.rm = TRUE))

# 3. 将计算结果合并回原数据集
# 使用left_join()将Baso_Count列添加到data_participant中
data_participant <- left_join(data_participant, baso_per_summary, by = "Participant.ID")


### EOS_Count

# 1. 选择所有EOS_Count相关的列
eos_count_columns <- names(data_participant)[grepl("Eosinophill.count", names(data_participant))]

# 2. 计算每个Participant.ID的eos_count值
# 如果只有一个值,则直接取该值;如果有多个值,则计算平均值
eos_count_summary <- data_participant %>%
  # 将eos列转换为长格式
  pivot_longer(cols = all_of(eos_count_columns), names_to = "Measurement", values_to = "Eos_Count") %>%
  # 去除缺失值
  filter(!is.na(Eos_Count)) %>%
  # 按Participant.ID分组,计算每个组的平均值
  group_by(Participant.ID) %>%
  summarise(Eos_Count = mean(Eos_Count, na.rm = TRUE))

# 3. 将计算结果合并回原数据集
# 使用left_join()将Eos_Count列添加到data_participant中
data_participant <- left_join(data_participant, eos_count_summary, by = "Participant.ID")

### EOS_Per

# 1. 选择所有EOS_Per相关的列
eos_per_columns <- names(data_participant)[grepl("Eosinophill.percentage", names(data_participant))]

# 2. 计算每个Participant.ID的eos_per值
# 如果只有一个值,则直接取该值;如果有多个值,则计算平均值
eos_per_summary <- data_participant %>%
  # 将eos列转换为长格式
  pivot_longer(cols = all_of(eos_per_columns), names_to = "Measurement", values_to = "Eos_Per") %>%
  # 去除缺失值
  filter(!is.na(Eos_Per)) %>%
  # 按Participant.ID分组,计算每个组的平均值
  group_by(Participant.ID) %>%
  summarise(Eos_Per = mean(Eos_Per, na.rm = TRUE))

# 3. 将计算结果合并回原数据集
# 使用left_join()将Eos_Per列添加到data_participant中
data_participant <- left_join(data_participant, eos_per_summary, by = "Participant.ID")

### HCT

# 1. 选择所有HCT相关的列
HCT_columns <- names(data_participant)[grepl("Haematocrit.percentage", names(data_participant))]

# 2. 计算每个Participant.ID的值
# 如果只有一个值,则直接取该值;如果有多个值,则计算平均值
HCT_summary <- data_participant %>%
  # 将列转换为长格式
  pivot_longer(cols = all_of(HCT_columns), names_to = "Measurement", values_to = "HCT") %>%
  # 去除缺失值
  filter(!is.na(HCT)) %>%
  # 按Participant.ID分组,计算每个组的平均值
  group_by(Participant.ID) %>%
  summarise(HCT = mean(HCT, na.rm = TRUE))

# 3. 将计算结果合并回原数据集
# 使用left_join()将列添加到data_participant中
data_participant <- left_join(data_participant, HCT_summary, by = "Participant.ID")

### HGB
# 1. 选择所有HGB相关的列
HGB_columns <- names(data_participant)[grepl("Haemoglobin.concentration", names(data_participant))]

# 2. 计算每个Participant.ID的HGB值
# 如果只有一个值,则直接取该值;如果有多个值,则计算平均值
HGB_summary <- data_participant %>%
  # 将HGB列转换为长格式
  pivot_longer(cols = all_of(HGB_columns), names_to = "Measurement", values_to = "HGB") %>%
  # 去除缺失值
  filter(!is.na(HGB)) %>%
  # 按Participant.ID分组,计算每个组的平均值
  group_by(Participant.ID) %>%
  summarise(HGB = mean(HGB, na.rm = TRUE))

# 3. 将计算结果合并回原数据集
# 使用left_join()将HGB列添加到data_participant中
data_participant <- left_join(data_participant, HGB_summary, by = "Participant.ID")

### CRP

# 1. 选择所有CRP相关的列
CRP_columns <- names(data_participant)[grepl("C.reactive.protein", names(data_participant))]

# 2. 计算每个Participant.ID的值
# 如果只有一个值,则直接取该值;如果有多个值,则计算平均值
CRP_summary <- data_participant %>%
  # 将列转换为长格式
  pivot_longer(cols = all_of(CRP_columns), names_to = "Measurement", values_to = "CRP") %>%
  # 去除缺失值
  filter(!is.na(CRP)) %>%
  # 按Participant.ID分组,计算每个组的平均值
  group_by(Participant.ID) %>%
  summarise(CRP = mean(CRP, na.rm = TRUE))

# 3. 将计算结果合并回原数据集
# 使用left_join()将列添加到data_participant中
data_participant <- left_join(data_participant, CRP_summary, by = "Participant.ID")

### Lym_Count

# 1. 选择所有相关的列
Lym_Count_columns <- names(data_participant)[grepl("Lymphocyte.count", names(data_participant))]

# 2. 计算每个Participant.ID的值
# 如果只有一个值,则直接取该值;如果有多个值,则计算平均值
Lym_Count_summary <- data_participant %>%
  # 将列转换为长格式
  pivot_longer(cols = all_of(Lym_Count_columns), names_to = "Measurement", values_to = "Lym_Count") %>%
  # 去除缺失值
  filter(!is.na(Lym_Count)) %>%
  # 按Participant.ID分组,计算每个组的平均值
  group_by(Participant.ID) %>%
  summarise(Lym_Count = mean(Lym_Count, na.rm = TRUE))

# 3. 将计算结果合并回原数据集
# 使用left_join()将列添加到data_participant中
data_participant <- left_join(data_participant, Lym_Count_summary, by = "Participant.ID")

### Lym_Per

# 1. 选择所有相关的列
Lym_Per_columns <- names(data_participant)[grepl("Lymphocyte.percentage", names(data_participant))]

# 2. 计算每个Participant.ID的值
# 如果只有一个值,则直接取该值;如果有多个值,则计算平均值
Lym_Per_summary <- data_participant %>%
  # 将列转换为长格式
  pivot_longer(cols = all_of(Lym_Per_columns), names_to = "Measurement", values_to = "Lym_Per") %>%
  # 去除缺失值
  filter(!is.na(Lym_Per)) %>%
  # 按Participant.ID分组,计算每个组的平均值
  group_by(Participant.ID) %>%
  summarise(Lym_Per = mean(Lym_Per, na.rm = TRUE))

# 3. 将计算结果合并回原数据集
# 使用left_join()将列添加到data_participant中
data_participant <- left_join(data_participant, Lym_Per_summary, by = "Participant.ID")

### Neu_Count

# 1. 选择所有相关的列
Neu_Count_columns <- names(data_participant)[grepl("Neutrophill.count", names(data_participant))]

# 2. 计算每个Participant.ID的值
# 如果只有一个值,则直接取该值;如果有多个值,则计算平均值
Neu_Count_summary <- data_participant %>%
  # 将列转换为长格式
  pivot_longer(cols = all_of(Neu_Count_columns), names_to = "Measurement", values_to = "Neu_Count") %>%
  # 去除缺失值
  filter(!is.na(Neu_Count)) %>%
  # 按Participant.ID分组,计算每个组的平均值
  group_by(Participant.ID) %>%
  summarise(Neu_Count = mean(Neu_Count, na.rm = TRUE))

# 3. 将计算结果合并回原数据集
# 使用left_join()将列添加到data_participant中
data_participant <- left_join(data_participant, Neu_Count_summary, by = "Participant.ID")

### Neu_Per

# 1. 选择所有相关的列
Neu_Per_columns <- names(data_participant)[grepl("Neutrophill.percentage", names(data_participant))]

# 2. 计算每个Participant.ID的值
# 如果只有一个值,则直接取该值;如果有多个值,则计算平均值
Neu_Per_summary <- data_participant %>%
  # 将列转换为长格式
  pivot_longer(cols = all_of(Neu_Per_columns), names_to = "Measurement", values_to = "Neu_Per") %>%
  # 去除缺失值
  filter(!is.na(Neu_Per)) %>%
  # 按Participant.ID分组,计算每个组的平均值
  group_by(Participant.ID) %>%
  summarise(Neu_Per = mean(Neu_Per, na.rm = TRUE))

# 3. 将计算结果合并回原数据集
# 使用left_join()将列添加到data_participant中
data_participant <- left_join(data_participant, Neu_Per_summary, by = "Participant.ID")

### WBC

# 1. 选择所有相关的列
WBC_columns <- names(data_participant)[grepl("White.blood.cell..leukocyte..count", names(data_participant))]

# 2. 计算每个Participant.ID的值
# 如果只有一个值,则直接取该值;如果有多个值,则计算平均值
WBC_summary <- data_participant %>%
  # 将列转换为长格式
  pivot_longer(cols = all_of(WBC_columns), names_to = "Measurement", values_to = "WBC") %>%
  # 去除缺失值
  filter(!is.na(WBC)) %>%
  # 按Participant.ID分组,计算每个组的平均值
  group_by(Participant.ID) %>%
  summarise(WBC = mean(WBC, na.rm = TRUE))

# 3. 将计算结果合并回原数据集
# 使用left_join()将列添加到data_participant中
data_participant <- left_join(data_participant, WBC_summary, by = "Participant.ID")

### PLT

# 1. 选择所有相关的列
PLT_columns <- names(data_participant)[grepl("Platelet.count", names(data_participant))]

# 2. 计算每个Participant.ID的值
# 如果只有一个值,则直接取该值;如果有多个值,则计算平均值
PLT_summary <- data_participant %>%
  # 将列转换为长格式
  pivot_longer(cols = all_of(PLT_columns), names_to = "Measurement", values_to = "PLT") %>%
  # 去除缺失值
  filter(!is.na(PLT)) %>%
  # 按Participant.ID分组,计算每个组的平均值
  group_by(Participant.ID) %>%
  summarise(PLT = mean(PLT, na.rm = TRUE))

# 3. 将计算结果合并回原数据集
# 使用left_join()将列添加到data_participant中
data_participant <- left_join(data_participant, PLT_summary, by = "Participant.ID")

### Mono_Count

# 1. 选择所有相关的列
Mono_Count_columns <- names(data_participant)[grepl("Monocyte.count", names(data_participant))]

# 2. 计算每个Participant.ID的值
# 如果只有一个值,则直接取该值;如果有多个值,则计算平均值
Mono_Count_summary <- data_participant %>%
  # 将列转换为长格式
  pivot_longer(cols = all_of(Mono_Count_columns), names_to = "Measurement", values_to = "Mono_Count") %>%
  # 去除缺失值
  filter(!is.na(Mono_Count)) %>%
  # 按Participant.ID分组,计算每个组的平均值
  group_by(Participant.ID) %>%
  summarise(Mono_Count = mean(Mono_Count, na.rm = TRUE))

# 3. 将计算结果合并回原数据集
# 使用left_join()将列添加到data_participant中
data_participant <- left_join(data_participant, Mono_Count_summary, by = "Participant.ID")

### Mono_Per

# 1. 选择所有相关的列
Mono_Per_columns <- names(data_participant)[grepl("Monocyte.percentage", names(data_participant))]

# 2. 计算每个Participant.ID的值
# 如果只有一个值,则直接取该值;如果有多个值,则计算平均值
Mono_Per_summary <- data_participant %>%
  # 将列转换为长格式
  pivot_longer(cols = all_of(Mono_Per_columns), names_to = "Measurement", values_to = "Mono_Per") %>%
  # 去除缺失值
  filter(!is.na(Mono_Per)) %>%
  # 按Participant.ID分组,计算每个组的平均值
  group_by(Participant.ID) %>%
  summarise(Mono_Per = mean(Mono_Per, na.rm = TRUE))

# 3. 将计算结果合并回原数据集
# 使用left_join()将列添加到data_participant中
data_participant <- left_join(data_participant, Mono_Per_summary, by = "Participant.ID")

### ALB

# 1. 选择所有相关的列
ALB_columns <- names(data_participant)[grepl("Albumin", names(data_participant))]

# 2. 计算每个Participant.ID的值
# 如果只有一个值,则直接取该值;如果有多个值,则计算平均值
ALB_summary <- data_participant %>%
  # 将列转换为长格式
  pivot_longer(cols = all_of(ALB_columns), names_to = "Measurement", values_to = "ALB") %>%
  # 去除缺失值
  filter(!is.na(ALB)) %>%
  # 按Participant.ID分组,计算每个组的平均值
  group_by(Participant.ID) %>%
  summarise(ALB = mean(ALB, na.rm = TRUE))

# 3. 将计算结果合并回原数据集
# 使用left_join()将列添加到data_participant中
data_participant <- left_join(data_participant, ALB_summary, by = "Participant.ID")

##删除NA值----
data_participant <- data_participant %>%
  filter(!is.na(Eos_Count))
data_participant <- data_participant %>%
  filter(!is.na(Eos_Per))
data_participant <- data_participant %>%
  filter(!is.na(HCT))
data_participant <- data_participant %>%
  filter(!is.na(Lym_Count))
data_participant <- data_participant %>%
  filter(!is.na(Lym_Per))
data_participant <- data_participant %>%
  filter(!is.na(Neu_Count))
data_participant <- data_participant %>%
  filter(!is.na(Neu_Per))
data_participant <- data_participant %>%
  filter(!is.na(WBC))
data_participant <- data_participant %>%
  filter(!is.na(Baso_Per))
data_participant <- data_participant %>%
  filter(!is.na(Baso_Count))
data_participant <- data_participant %>%
  filter(!is.na(HGB))
data_participant <- data_participant %>%
  filter(!is.na(CRP))
data_participant <- data_participant %>%
  filter(!is.na(Neu_Count))
data_participant <- data_participant %>%
  filter(!is.na(Neu_Per))
data_participant <- data_participant %>%
  filter(!is.na(PLT))
data_participant <- data_participant %>%
  filter(!is.na(Mono_Count))
data_participant <- data_participant %>%
  filter(!is.na(Mono_Per))
data_participant <- data_participant %>%
  filter(!is.na(ALB))


## 输出---- 
write.csv(data_participant, "merged_data.csv")
## 读取---- 
merged_data <- read.csv("merged_data.csv")



# 对inpatient进行数据清洗----
df <- inpatient
## 拆分诊断列
# 目标疾病代码
target_disease <- c("J96.0", "J96.1", "96.9") #J96:呼吸衰竭

# 将目标疾病代码合并成一个正则表达式
pattern <- paste(target_disease, collapse = "|")

# 使用正则表达式筛选包含目标疾病代码的行
df_filtered <- df %>%
  filter(grepl(pattern, p41270))

# 将 p41270 列拆分为多行并生成索引
library(tidyr)
df_long <- df_filtered %>%
  separate_rows(p41270, sep = "\\|") %>%
  group_by(Participant.ID) %>%
  mutate(disease_index = row_number()) %>%
  ungroup()

# 获取诊断时间列的名称
time_columns <- grep("^X41280\\.0\\.", names(df), value = TRUE)

# 定义一个函数,用于处理每一批次数据
process_batch <- function(batch_data, original_df, time_columns) {
  
  batch_data <- batch_data %>%
    rowwise() %>%
    mutate(
      # 根据当前行确定诊断时间列的索引
      diagnosis_time_col = time_columns[disease_index],
      
      # 使用 match 和列名从原始数据框中提取对应的诊断时间
      first_diagnosis_time = ifelse(!is.na(diagnosis_time_col),
                                    original_df[[diagnosis_time_col]][match(Participant.ID, original_df$Participant.ID)],
                                    NA)
    ) %>%
    ungroup()
  
  return(batch_data)
}

# 数据分批处理
batch_size <- 1000  # 每批次处理1000行,可以根据你的计算机性能进行调整
num_batches <- ceiling(nrow(df_long) / batch_size)

# 初始化存储最终结果的数据框
final_df_long <- data.frame()

# 按批次处理数据
for (i in 1:num_batches) {
  start_row <- ((i - 1) * batch_size) + 1
  end_row <- min(i * batch_size, nrow(df_long))
  
  # 获取当前批次的数据
  batch_data <- df_long[start_row:end_row, ]
  
  # 处理当前批次数据
  processed_batch <- process_batch(batch_data, df, time_columns)
  
  # 将处理好的批次数据追加到最终数据框
  final_df_long <- bind_rows(final_df_long, processed_batch)
  
  # 打印进度信息
  print(paste("Processed batch", i, "of", num_batches))
}

# 检查最终结果
head(final_df_long)

# 输出
write.csv(final_df_long, "diagnosis_separated.csv")

# 读取
diagnosis <- read.csv("diagnosis_separated.csv")


# 对smoking进行数据清洗----
# 整理Instance 0至Instance 3的列----

## Current tobacco smoking
smoking <- smoking %>%
  mutate(current.tobacco.smoking = case_when(
    # 四列中有一列为 "No"
    `Current.tobacco.smoking...Instance.0` == "No" |
      `Current.tobacco.smoking...Instance.1` == "No" |
      `Current.tobacco.smoking...Instance.2` == "No" |
      `Current.tobacco.smoking...Instance.3` == "No" ~ "No",
    
    # 四列中有一列为 "Yes, on most or all days"
    `Current.tobacco.smoking...Instance.0` == "Yes, on most or all days" |
      `Current.tobacco.smoking...Instance.1` == "Yes, on most or all days" |
      `Current.tobacco.smoking...Instance.2` == "Yes, on most or all days" |
      `Current.tobacco.smoking...Instance.3` == "Yes, on most or all days" ~ "Yes, on most or all days",
    
    # 四列中有一列为 "Only occasionally"
    `Current.tobacco.smoking...Instance.0` == "Only occasionally" |
      `Current.tobacco.smoking...Instance.1` == "Only occasionally" |
      `Current.tobacco.smoking...Instance.2` == "Only occasionally" |
      `Current.tobacco.smoking...Instance.3` == "Only occasionally" ~ "Only occasionally",
    
    # 四列中有一列为 "Prefer not to answer"
    `Current.tobacco.smoking...Instance.0` == "Prefer not to answer" |
      `Current.tobacco.smoking...Instance.1` == "Prefer not to answer" |
      `Current.tobacco.smoking...Instance.2` == "Prefer not to answer" |
      `Current.tobacco.smoking...Instance.3` == "Prefer not to answer" ~ "Prefer not to answer",
    
    # 默认值
    TRUE ~ "Unknown"  # 如果没有符合的情况,设为 "Unknown"
  ))

## Past tobacco smoking
smoking <- smoking %>%
  mutate(past.tobacco.smoking = case_when(
    # 四列中有一列为 "I have never smoked"
    `Past.tobacco.smoking...Instance.0` == "I have never smoked" |
      `Past.tobacco.smoking...Instance.1` == "I have never smoked" |
      `Past.tobacco.smoking...Instance.2` == "I have never smoked" |
      `Past.tobacco.smoking...Instance.3` == "I have never smoked" ~ "I have never smoked",
    
    # 四列中有一列为 Just tried once or twice"
    `Past.tobacco.smoking...Instance.0` == "Just tried once or twice" |
      `Past.tobacco.smoking...Instance.1` == "Just tried once or twice" |
      `Past.tobacco.smoking...Instance.2` == "Just tried once or twice" |
      `Past.tobacco.smoking...Instance.3` == "Just tried once or twice" ~ "Just tried once or twice",
    
    # 四列中有一列为 "Smoked occasionally"
    `Past.tobacco.smoking...Instance.0` == "Smoked occasionally" |
      `Past.tobacco.smoking...Instance.1` == "Smoked occasionally" |
      `Past.tobacco.smoking...Instance.2` == "Smoked occasionally" |
      `Past.tobacco.smoking...Instance.3` == "Smoked occasionally" ~ "Smoked occasionally",
    
    # 四列中有一列为 "Smoked on most or all days"
    `Past.tobacco.smoking...Instance.0` == "Smoked on most or all days" |
      `Past.tobacco.smoking...Instance.1` == "Smoked on most or all days" |
      `Past.tobacco.smoking...Instance.2` == "Smoked on most or all days" |
      `Past.tobacco.smoking...Instance.3` == "Smoked on most or all days" ~ "Smoked on most or all days",
    
    # 四列中有一列为 "Prefer not to answer"
    `Past.tobacco.smoking...Instance.0` == "Prefer not to answer" |
      `Past.tobacco.smoking...Instance.1` == "Prefer not to answer" |
      `Past.tobacco.smoking...Instance.2` == "Prefer not to answer" |
      `Past.tobacco.smoking...Instance.3` == "Prefer not to answer" ~ "Prefer not to answer",
    
    # 默认值
    TRUE ~ "Unknown"  # 如果没有符合的情况,设为 "Unknown"
  ))

## Ever smoked
smoking <- smoking %>%
  mutate(ever.smoked = case_when(
    # 四列中有一列为 "No"
    `Ever.smoked...Instance.0` == "No" |
      `Ever.smoked...Instance.1` == "No" |
      `Ever.smoked...Instance.2` == "No" |
      `Ever.smoked...Instance.3` == "No" ~ "No",
    
    # 四列中有一列为 "Yes"
    `Ever.smoked...Instance.0` == "Yes" |
      `Ever.smoked...Instance.1` == "Yes" |
      `Ever.smoked...Instance.2` == "Yes" |
      `Ever.smoked...Instance.3` == "Yes" ~ "Yes",
    
    # 默认值
    TRUE ~ "Unknown"  # 如果没有符合的情况,设为 "Unknown"
  ))

## Smoking.status
smoking <- smoking %>%
  mutate(smoking.status = case_when(
    # 四列中有一列为 "Current"
    `Smoking.status...Instance.0` == "Current" |
      `Smoking.status...Instance.1` == "Current" |
      `Smoking.status...Instance.2` == "Current" |
      `Smoking.status...Instance.3` == "Current" ~ "Current",
    # 四列中有一列为 "Previous"
    `Smoking.status...Instance.0` == "Previous" |
      `Smoking.status...Instance.1` == "Previous" |
      `Smoking.status...Instance.2` == "Previous" |
      `Smoking.status...Instance.3` == "Previous" ~ "Previous",
    # 四列中有一列为 "Never"
    `Smoking.status...Instance.0` == "Never" |
      `Smoking.status...Instance.1` == "Never" |
      `Smoking.status...Instance.2` == "Never" |
      `Smoking.status...Instance.3` == "Never" ~ "Never",
    # 四列中有一列为 "Prefer not to answer"
    `Smoking.status...Instance.0` == "Prefer not to answer" |
      `Smoking.status...Instance.1` == "Prefer not to answer" |
      `Smoking.status...Instance.2` == "Prefer not to answer" |
      `Smoking.status...Instance.3` == "Prefer not to answer" ~ "Prefer not to answer",
    
    # 默认值
    TRUE ~ "Unknown"  # 如果没有符合的情况,设为 "Unknown"
  ))

## Light smokers, at least 100 smokes in lifetime
smoking <- smoking %>%
  mutate(Light.smokers..at.least.100.smokes.in.lifetime = case_when(
    # 四列中有一列为 "Yes"
    `Light.smokers..at.least.100.smokes.in.lifetime...Instance.0` == "Yes" |
      `Light.smokers..at.least.100.smokes.in.lifetime...Instance.1` == "Yes" |
      `Light.smokers..at.least.100.smokes.in.lifetime...Instance.2` == "Yes" |
      `Light.smokers..at.least.100.smokes.in.lifetime...Instance.3` == "Yes" ~ "Yes",
    # 四列中有一列为 "No"
    `Light.smokers..at.least.100.smokes.in.lifetime...Instance.0` == "No" |
      `Light.smokers..at.least.100.smokes.in.lifetime...Instance.1` == "No" |
      `Light.smokers..at.least.100.smokes.in.lifetime...Instance.2` == "No" |
      `Light.smokers..at.least.100.smokes.in.lifetime...Instance.3` == "No" ~ "No",
    # 四列中有一列为 "Do not know"
    `Light.smokers..at.least.100.smokes.in.lifetime...Instance.0` == "Do not know" |
      `Light.smokers..at.least.100.smokes.in.lifetime...Instance.1` == "Do not know" |
      `Light.smokers..at.least.100.smokes.in.lifetime...Instance.2` == "Do not know" |
      `Light.smokers..at.least.100.smokes.in.lifetime...Instance.3` == "Do not know" ~ "Do not know",
    # 四列中有一列为 "Prefer not to answer"
    `Light.smokers..at.least.100.smokes.in.lifetime...Instance.0` == "Prefer not to answer" |
      `Light.smokers..at.least.100.smokes.in.lifetime...Instance.1` == "Prefer not to answer" |
      `Light.smokers..at.least.100.smokes.in.lifetime...Instance.2` == "Prefer not to answer" |
      `Light.smokers..at.least.100.smokes.in.lifetime...Instance.3` == "Prefer not to answer" ~ "Prefer not to answer",
    
    # 默认值
    TRUE ~ "Unknown"  # 如果没有符合的情况,设为 "Unknown"
  ))

## Pack years of smoking
smoking <- smoking %>%
  mutate(Pack_years_smoking = coalesce(Pack.years.of.smoking...Instance.0, 
                                       Pack.years.of.smoking...Instance.1, 
                                       Pack.years.of.smoking...Instance.2, 
                                       Pack.years.of.smoking...Instance.3))



# 整理变量“是否吸烟”[剔除Light smokers作为衡量标准]----
smoking <- smoking %>%
  mutate(smoking = case_when(
    # Yes
    `current.tobacco.smoking` == "Yes, on most or all days" |
      `current.tobacco.smoking` == "Only occasionally" |
      `past.tobacco.smoking` == "Just tried once or twice" |
      `past.tobacco.smoking` == "Smoked occasionally" |
      `past.tobacco.smoking` == "Smoked on most or all days" |
      `smoking.status` == "Yes" |
      `ever.smoked` == "Yes" ~ "Yes",
    # No
    `current.tobacco.smoking` == "No" &
      `past.tobacco.smoking` == "I have never smoked" &
      `smoking.status` == "Never" &
      `ever.smoked` == "No" ~ "No",
    TRUE ~ "Unknown"  # 如果没有符合的情况,设为 "Unknown"
  ))
write.csv(smoking, "smoking_clean.csv")

# 手动调整smoking列为Unknown的样本----
## 0.所有吸烟指标均为Unknown 或 Prefer not to answer(1127)
## a. 吸烟指数有明确数值 -> Yes(247)
## b. Current & Past & Ever smoked = No, Smoking status = Current/Previous -> 删除(873)
## d. Current = No, 其余均为Unknown或Prefer not to answer -> No(1540)
## b. Past & Smoking status = No, 其余均为Unknown或Prefer not to answer -> No(36)
## e. Smoking status = Never -> No(36)

# 手动调整smoking列为No的样本----
## a. Current = No, Light smokers = Yes -> 删除(356)

# 手动调整smoking列为Yes的样本----
## a. Current & Past = No, Smoking status = Current/Previous -> 删除(249)
## b. Current = No, Smoking status = Current -> 删除(3781)
## c. Current = No, Light smokers = Yes -> 删除(55197)

# !手动删除Pack_years_smoking 为NA的样本[不能删,因为不吸烟的都是NA!]!----
# 手动删除Smoking = Yes,Pack_years_smoking为NA或0的样本(87970)----
# 手动将smoking = No的吸烟指数调整为0

# 明确COPD的诊断----
diagnosis_separated <- diagnosis_separated %>%
  mutate(
    COPD = ifelse(grepl("^J44", diagnosis), "Yes", "No")
  )

diagnosis_separated <- diagnosis_separated %>%
  group_by(Participant.ID) %>%
  mutate(
    COPD_time = ifelse(
      any(COPD == "Yes"), 
      min(first_diagnosis_time[COPD == "Yes"], na.rm = TRUE), 
      NA
    )
  ) %>%
  ungroup()

# 明确呼衰的诊断----

# 步骤1:创建RF列(仅匹配J96.0/J96.1/J96.9)
diagnosis_separated <- diagnosis_separated %>%
  mutate(
    RF = ifelse(
      grepl("^J96\\.0|^J96\\.1|^J96\\.9", diagnosis), 
      "Yes", 
      "No"
    )
  )

# 步骤2:创建RF_time列(仅针对J96.0/J96.1/J96.9的首次诊断时间)
diagnosis_separated <- diagnosis_separated %>%
  group_by(Participant.ID) %>%
  mutate(
    RF_time = ifelse(
      any(RF == "Yes"),
      min(first_diagnosis_time[RF == "Yes"], na.rm = TRUE),
      NA
    )
  ) %>%
  ungroup()

# 步骤3:筛选数据
# 标记需排除的呼吸系统疾病(除COPD外)
diagnosis_separated <- diagnosis_separated %>%
  mutate(
    # 标记哮喘(J45)、间质性肺病(J84)、肺癌(C34)、肺炎(J12-J18)等
    exclude_resp = ifelse(
      grepl("^J45|^J84|^C34|^J1[2-8]", diagnosis), 
      "Yes", 
      "No"
    )
  )

# 步骤2:标记终末期疾病(终末期肾病N18.5、肿瘤C00-C97)
diagnosis_separated <- diagnosis_separated %>%
  mutate(
    # 终末期肾病:N18.5;肿瘤:C00-C97
    exclude_terminal = ifelse(
      grepl("^N18\\.5|^C", diagnosis), 
      "Yes", 
      "No"
    )
  )

# 步骤3:标记预期寿命不足3年者(终末期肺纤维化J84.1、胰腺癌C25、ALS G12.2等)
diagnosis_separated <- diagnosis_separated %>%
  mutate(
    exclude_lifespan = ifelse(
      grepl("^J84\\.1|^C25|^G12\\.2", diagnosis), 
      "Yes", 
      "No"
    )
  )

# 步骤4:综合筛选(保留不符合任何排除条件的患者)
filtered_data <- diagnosis_separated %>%
  filter(
    exclude_resp == "No" &   # 排除COPD外的呼吸系统疾病[1,3](@ref)
      exclude_terminal == "No" &  # 排除终末期肾病/肿瘤[6,7](@ref)
      exclude_lifespan == "No"    # 排除预期寿命<3年者[8,9](@ref)
  ) %>%
  select(-starts_with("exclude"))  # 删除临时标记列

diagnosis_separated <- diagnosis_separated %>%
  # 删除COPD和RF均为No的患者
  filter(!(COPD == "No" & RF == "No")) %>%
  # 将时间缺失值设为NA,避免Inf干扰比较
  mutate(
    COPD_time = ifelse(COPD == "Yes", COPD_time, NA),
    RF_time = ifelse(RF == "Yes", RF_time, NA)
  ) %>%
  # 删除RF发生在COPD前的患者(仅当两者均为Yes时比较)
  filter(
    !(COPD == "Yes" & RF == "Yes" & RF_time < COPD_time)
  )

# 输出
write.csv(diagnosis_separated, "real_outcome.csv")

# 数据整合

# 读取三个CSV文件
merged_data <- read.csv("merged_data.csv")
smoking_clean <- read.csv("smoking_clean.csv")
real_outcome <- read.csv("real_outcome.csv")

# 全外连接合并(保留所有行)
merge_combine_data <- merged_data %>%
  full_join(smoking_clean, by = "Participant.ID") %>%
  full_join(real_outcome, by = "Participant.ID")
评论
随想录. All Rights Reserved. Theme Jasmine by Kent Liao.