3  数据处理与操作

3.0.1 filter 筛选行

dataset %>%    
  filter(sex == "F", age <= 13) #,=and  
dataset %>%    
  filter(age %in% c(13, 15)) #%in% 同 in  
dataset %>%    
  filter(!(age < 13 | age > 16)) #表示13<=age<=16,|同或,!同非 
dataset %>%    
  filter(!age %in% c(13, 15)) #表示不要13,15行

3.0.2 filter 对多列进行筛选行

dataset %>%    
  filter(if_all(4:6, ~ .x > 75)) #表示筛选出第4-6列中所有列都>75的行
dataset %>% 
  filter(if_any(where(is.character),is.na)) #从所有字符列中选择任意一列存在NA的行

3.0.3 select 选择列删掉列

dataset %>% 
  select(sex, height, weight)

dataset %>% 
  select(name:age) #从name到age之间的列

dataset %>% 
  select(1:3) #选择第1到第3列

dataset %>% 
  select(-name, -age) #drop name age列

dataset %>% 
  select(last_col(2)) #选择倒数2列

dataset %>% 
  select(where(is.numeric)) #选择所有数值列

3.0.4 pull 取单个变量作为一个向量

dataset %>% 
  head(n = 5) %>% 
  pull(name)  #取dataset中name变量的前5行作为一个向量

3.0.5 rename 修改列名

dataset %>% 
  rename(new_name = old_name)

3.0.6 mutate 衍生新的列

dataset %>% 
  mutate(sex = if_else(sex == "男", "M", "F"))

dataset %>% 
  mutate(grade = case_when(math >= 75 ~ "high",
                           math >= 60 ~ "middle",
                           TRUE, ~ "low"))

3.0.7 across 对多个列处理

dataset %>% 
  mutate(across(everything(), as.character)) #将所有列转换为字符

dataset %>% 
  mutate(across(contains("Length")|contains("Width"), ~ .x*10)) #将列名中含有Length或Width字符的列*10

3.0.8 arrange 排序

dataset %>% 
  arrange(desc(col1), col2) #按照col1倒序排序,再按照col2顺序排序,注意缺失值NA排在最后面,不同于SAS

3.0.9 distinct 去重

dataset %>% 
  distinct(col1, col2, .keep_all = TRUE) #.keep_all 是否要保留所有列

3.0.10 count 计数

dataset %>% 
  count(col1) #计算col1各个组别的数量

3.0.11 pivot_longer, pivot_wider 转置

dataset %>% 
  pivot_longer(cols = paste0("header", 1:5),
               names_to = "header_label",
               values_to = "header")

dataset %>% 
  pivot_wider(names_from = col1,
              values_from = c(col2, col3), #可以设置多个列进行转置
              values_fill = 0)

3.0.12 left_join, right_join, full_join横向合并数据集

dataset %>% 
  left_join(dataset1, dataset2, by = c("bycol1", "bycol2"))

3.0.13 rbind.fill 纵向合并数据集

dataset %>% 
  rbind.fill(dataset1, dataset)

3.0.14 cumsum, lag 实现有条件的累加计数

dataset %>% 
  mutate(TiaoJian = x<=lag(x) & y<lag(y),
         TiaoJianYN = ifelse(is.na(TiaoJian),FALSE,TiaoJian),
         page = cumsum(TiaoJianYN + 0L))