3 数据处理与操作
3.0.1 filter 筛选行
dataset %>%
filter(sex == "F", age <= 13) #,=and
dataset %>%
filter(age %in% c(13, 15)) #%in% 同 in
dataset %>%
filter(!(age < 13 | age > 16)) #表示13<=age<=16,|同或,!同非
dataset %>%
filter(!age %in% c(13, 15)) #表示不要13,15行3.0.2 filter 对多列进行筛选行
dataset %>%
filter(if_all(4:6, ~ .x > 75)) #表示筛选出第4-6列中所有列都>75的行
dataset %>%
filter(if_any(where(is.character),is.na)) #从所有字符列中选择任意一列存在NA的行3.0.3 select 选择列删掉列
dataset %>%
select(sex, height, weight)
dataset %>%
select(name:age) #从name到age之间的列
dataset %>%
select(1:3) #选择第1到第3列
dataset %>%
select(-name, -age) #drop name age列
dataset %>%
select(last_col(2)) #选择倒数2列
dataset %>%
select(where(is.numeric)) #选择所有数值列3.0.4 pull 取单个变量作为一个向量
dataset %>%
head(n = 5) %>%
pull(name) #取dataset中name变量的前5行作为一个向量3.0.5 rename 修改列名
dataset %>%
rename(new_name = old_name)3.0.6 mutate 衍生新的列
dataset %>%
mutate(sex = if_else(sex == "男", "M", "F"))
dataset %>%
mutate(grade = case_when(math >= 75 ~ "high",
math >= 60 ~ "middle",
TRUE, ~ "low"))3.0.7 across 对多个列处理
dataset %>%
mutate(across(everything(), as.character)) #将所有列转换为字符
dataset %>%
mutate(across(contains("Length")|contains("Width"), ~ .x*10)) #将列名中含有Length或Width字符的列*103.0.8 arrange 排序
dataset %>%
arrange(desc(col1), col2) #按照col1倒序排序,再按照col2顺序排序,注意缺失值NA排在最后面,不同于SAS3.0.9 distinct 去重
dataset %>%
distinct(col1, col2, .keep_all = TRUE) #.keep_all 是否要保留所有列3.0.10 count 计数
dataset %>%
count(col1) #计算col1各个组别的数量3.0.11 pivot_longer, pivot_wider 转置
dataset %>%
pivot_longer(cols = paste0("header", 1:5),
names_to = "header_label",
values_to = "header")
dataset %>%
pivot_wider(names_from = col1,
values_from = c(col2, col3), #可以设置多个列进行转置
values_fill = 0)3.0.12 left_join, right_join, full_join横向合并数据集
dataset %>%
left_join(dataset1, dataset2, by = c("bycol1", "bycol2"))3.0.13 rbind.fill 纵向合并数据集
dataset %>%
rbind.fill(dataset1, dataset)3.0.14 cumsum, lag 实现有条件的累加计数
dataset %>%
mutate(TiaoJian = x<=lag(x) & y<lag(y),
TiaoJianYN = ifelse(is.na(TiaoJian),FALSE,TiaoJian),
page = cumsum(TiaoJianYN + 0L))