3 数据处理与操作
3.0.1 filter 筛选行
dataset %>%
  filter(sex == "F", age <= 13) #,=and
dataset %>%
  filter(age %in% c(13, 15)) #%in% 同 in
dataset %>%
  filter(!(age < 13 | age > 16)) #表示13<=age<=16,|同或,!同非
dataset %>%
  filter(!age %in% c(13, 15)) #表示不要13,15行
3.0.2 filter 对多列进行筛选行
dataset %>%
  filter(if_all(4:6, ~ .x > 75)) #表示筛选出第4-6列中所有列都>75的行
dataset %>%
  filter(if_any(where(is.character), is.na)) #从所有字符列中选择任意一列存在NA的行
3.0.3 select 选择列删掉列
dataset %>%
  select(sex, height, weight)
dataset %>%
  select(name:age) #从name到age之间的列
dataset %>%
  select(1:3) #选择第1到第3列
dataset %>%
  select(-name, -age) #drop name age列
dataset %>%
  select(last_col(2)) #选择倒数第3列(last_col的offset从0开始计数);若要选择最后2列,用 select(last_col(1):last_col())
dataset %>%
  select(where(is.numeric)) #选择所有数值列
3.0.4 pull 取单个变量作为一个向量
dataset %>%
  head(n = 5) %>%
  pull(name) #取dataset中name变量的前5行作为一个向量
3.0.5 rename 修改列名
dataset %>%
  rename(new_name = old_name)
3.0.6 mutate 衍生新的列
dataset %>%
  mutate(sex = if_else(sex == "男", "M", "F"))
dataset %>%
  mutate(grade = case_when(math >= 75 ~ "high",
                           math >= 60 ~ "middle",
                           TRUE ~ "low"))
3.0.7 across 对多个列处理
dataset %>%
  mutate(across(everything(), as.character)) #将所有列转换为字符
dataset %>%
  mutate(across(contains("Length")|contains("Width"), ~ .x*10)) #将列名中含有Length或Width字符的列*10
3.0.8 arrange 排序
dataset %>%
  arrange(desc(col1), col2) #按照col1倒序排序,再按照col2顺序排序,注意缺失值NA排在最后面,不同于SAS
3.0.9 distinct 去重
dataset %>%
  distinct(col1, col2, .keep_all = TRUE) #.keep_all 是否要保留所有列
3.0.10 count 计数
dataset %>%
  count(col1) #计算col1各个组别的数量
3.0.11 pivot_longer, pivot_wider 转置
dataset %>%
  pivot_longer(cols = paste0("header", 1:5),
               names_to = "header_label",
               values_to = "header")
dataset %>%
  pivot_wider(names_from = col1,
              values_from = c(col2, col3), #可以设置多个列进行转置
              values_fill = 0)
3.0.12 left_join, right_join, full_join横向合并数据集
dataset <- dataset1 %>%
  left_join(dataset2, by = c("bycol1", "bycol2"))
3.0.13 rbind.fill 纵向合并数据集
dataset <- dataset1 %>%
  rbind.fill(dataset2) #rbind.fill 来自 plyr 包
3.0.14 cumsum, lag 实现有条件的累加计数
dataset %>%
  mutate(TiaoJian = x<=lag(x) & y<lag(y),
         TiaoJianYN = ifelse(is.na(TiaoJian),FALSE,TiaoJian),
         page = cumsum(TiaoJianYN + 0L))