<- c("id", "name", "parent_id", "initial", "initials",
full_name "pinyin", "extra", "suffix", "code", "area_code", "order")
<- "https://raw.githubusercontent.com/eduosi/district/master/district-full.csv"
url_csv <- readr::read_delim(
district url(url_csv),
col_types = cols(.default = "c"),
col_names = F, delim = "\t") %>%
as_tibble() %>%
rename_all(., ~all_of(full_name))
::kable(head(district), caption = "地区区号信息") knitr
1 R function pitfalls
1.1 Reading files
1.1.1 How to keep readr from guessing column types
By default, the readr import functions guess the type of each data column, as reflected in col_types.
As the district-area-code example above shows, passing the argument col_types = cols(.default = "c") forces every column to be read as character.
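A minimal, self-contained sketch of the same idea, using the mtcars.csv sample file that ships with readr:

library(readr)

# read everything as character: no type guessing happens
df <- read_csv(readr_example("mtcars.csv"),
               col_types = cols(.default = "c"))
class(df$mpg)  # "character" rather than a guessed numeric type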
1.1.2 How to read the SPSS .sav format
- Reading SPSS files: to avoid encoding errors with Chinese and other multi-byte text, prefer the foreign::read.spss() function over haven::read_sav().
# haven::read_sav() raises a warning here
file_tar <- here("data/case-3.6-computer-sales.sav")
tryCatch(df_computer <- haven::read_sav(file_tar, encoding = "UTF-8", .name_repair = "unique"),
         warning = function(c) "warning")

# foreign::read.spss() reads and displays correctly
file_tar <- here("data/case-3.6-computer-sales.sav")
df_computer <- foreign::read.spss(file_tar, to.data.frame = TRUE)
re-encoding from CP936
knitr::kable(head(df_computer), align = "c")
月份 | 北京 | 长春 | 南京 | 郑州 | 武汉 | 广州 | 成都 | 昆明 | 兰州 | 西安 |
---|---|---|---|---|---|---|---|---|---|---|
1 | 49 | 70 | 76 | 57 | 77 | 72 | 79 | 65 | 51 | 67 |
2 | 41 | 68 | 71 | 57 | 75 | 80 | 83 | 65 | 41 | 67 |
3 | 47 | 50 | 77 | 68 | 81 | 80 | 81 | 58 | 49 | 74 |
4 | 50 | 39 | 72 | 67 | 75 | 84 | 79 | 61 | 46 | 70 |
5 | 55 | 56 | 68 | 63 | 71 | 83 | 75 | 58 | 41 | 58 |
6 | 57 | 54 | 73 | 57 | 74 | 87 | 82 | 72 | 43 | 42 |
1.1.3 How to read JSON correctly
R packages needed:
library("jsonlite")
library("tidyjson")
References:
The key steps are:
- Use map_if() to convert nested data frames into lists.
- Call unnest() repeatedly; to handle colliding column names, set names_repair = tidyr_legacy.
# download.file("http://arxiv.org/pdf/1403.2805.pdf", "data-raw/pdf/1403.2805.pdf", mode = "wb")

# Table of contents
file_tar <- here("data-raw/pdf/1403.2805.pdf")
toc <- pdftools::pdf_toc(file_tar)

# Show as JSON
text_json <- jsonlite::toJSON(toc, auto_unbox = TRUE, pretty = TRUE)

# back to an R list
y_list <- jsonlite::fromJSON(text_json)

# pure list
y_list_new <- map_if(y_list, is.data.frame, list)

# flatten to a tibble
y_df_new <- as_tibble(y_list_new) %>%
  unnest(title, names_repair = tidyr_legacy) %>%
  unnest(children, names_repair = tidyr_legacy) %>%
  unnest(children, names_repair = tidyr_legacy) %>%
  mutate(children = map(children, as_tibble)) %>%
  unnest(children, names_repair = tidyr_legacy,
         keep_empty = TRUE) %>%
  select(tidyselect::matches("title\\d{1}", perl = TRUE))
DT::datatable(
  y_df_new,
  options = list(
    dom = "tip",
    pageLength = 10)
)
1.2 Working with date objects
1.2.1 Numeric to date
Rule of thumb: when converting date formats, mind the default origin of each environment. Stata's default is origin = "1960-01-01", while R's default is origin = "1970-01-01".
First, consider a common piece of date-handling code:

start_d <- lubridate::as_date("1989-01-01")
end_d <- as_date("2006-12-31")

tot_d <- lubridate::as.duration(interval(start_d, end_d)) %/%
  as.duration(days(1)) + 1
tot_y <- as.duration(interval(start_d, end_d)) %/%
  as.duration(years(1)) + 1
The results read:
- The observation station operated for at least tot_y = 18 years, with no interruption longer than two months.
- The total observation span is tot_d = 6574 days, starting at start_d = 1989-01-01 and ending at end_d = 2006-12-31.
Next, consider converting numeric values to dates:

# randomly select 10 start dates
start_dates <- c(12274, 12614, 12717,
                 12804, 13061, 13344,
                 13712, 14471, 14910,
                 15678)

## default origin specification
### origin = "1970-01-01"
start_dates1 <- lubridate::as_date(
  start_dates,
  origin = lubridate::origin)

## custom origin specification
start_dates2 <- lubridate::as_date(
  start_dates,
  origin = "1960-01-01")
The two results differ:
- Under the default (R) origin, the first date in start_dates1 is 2003-08-10.
- Under the specified (Stata) origin, the first date in start_dates2 is 1993-08-09.
Going further:

# option 1
lubridate::as_date(0, origin = lubridate::origin) # "1970-01-01"
[1] "1970-01-01"
# option 2
lubridate::as_date(0, origin = "1960-01-01")
[1] "1960-01-01"
Also note the following behavior: fractional days are not rounded to the nearest day; as the outputs below show, they are simply truncated, as floor() would do:

# note the difference
lubridate::as_date(10593 + 4*365, origin = "1960-01-01")
[1] "1992-12-31"
lubridate::as_date(10593 + 4*365 + 0.6, origin = "1960-01-01")
[1] "1992-12-31"
lubridate::as_date(10593 + 4*365 + 0.4, origin = "1960-01-01")
[1] "1992-12-31"
Finally, when R reads a Stata .dta data file that contains dates:

file_url <- "D:/ProgramData/AER20090377_FinalData.dta"
df_raw <- foreign::read.dta(file_url, convert.dates = TRUE)

Here foreign::read.dta converts the dates using Stata's default origin = "1960-01-01", not R's default origin = "1970-01-01".
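If a day-count column were instead read in as plain numeric, it could be converted manually with the Stata origin; a sketch (datenum is a hypothetical column name):

# sketch only: `datenum` stands for a Stata day-count column
df_raw$date <- lubridate::as_date(df_raw$datenum, origin = "1960-01-01")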
References:
1.2.2 Date to weekday and week number
Rule of thumb: cultural conventions matter here. lubridate::wday() starts the week on Sunday by default (week_start = 7); where Monday counts as the first day of the week (as in China and under the ISO standard), remember to set week_start = 1.
# some sunday
one_sunday <- as.Date("2022-06-12")
# weekday of that day
# week_start: 1 = Monday, while default 7 = Sunday
## show weekday label
lubridate::wday(one_sunday, label = TRUE)
[1] 周日
Levels: 周日 < 周一 < 周二 < 周三 < 周四 < 周五 < 周六
## show weekday number with `week_start = 1`
lubridate::wday(one_sunday, week_start = 1)
[1] 7
## show weekday number with `week_start = 7`
lubridate::wday(one_sunday, week_start = 7)
[1] 1
The same distinction carries over when converting dates to week numbers.

check_dates <- as.Date(
  c("2022-04-10", "2022-04-11",
    "2022-05-21", "2022-05-22",
    "2022-06-12",
    "2022-06-19"
  )
)

tbl_date <- tibble(dates = check_dates) %>%
  mutate(weekday_label = lubridate::wday(dates, label = TRUE),
         weekday_us = lubridate::wday(dates),
         weekday_iso = lubridate::wday(dates, week_start = 1)
  ) %>%
  mutate(weeks_us = week(dates),
         weeks_iso = isoweek(dates))
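The comparison table can then be printed with kable(); a sketch:

kable(tbl_date, caption = "weekday and week numbers under both conventions")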
1.2.3 Date to character

time_initor <- lubridate::as_datetime("2022-12-16 13:30:00", tz = "Asia/Shanghai")

duration <- 30 # minutes
time_start <- strftime(time_initor, format = "%H:%M")
time_end <- strftime(time_initor + 60*duration, format = "%H:%M")

cat(paste0("initiate time (CST): ", time_initor,
           "\n duration: ", duration, " minutes.",
           "\n start hm: ", time_start,
           "\n end hm: ", time_end))
initiate time (CST): 2022-12-16 13:30:00
duration: 30 minutes.
start hm: 13:30
end hm: 14:00
1.3 Data frame operations
1.3.1 setNames
Assigning names dynamically:
(demo_vct <- c(1, 2))
[1] 1 2
(demo_vct <- setNames(demo_vct, paste0("V", 1:2)))
V1 V2
1 2
<- list("a", "b", "c")) (demo_list
[[1]]
[1] "a"
[[2]]
[1] "b"
[[3]]
[1] "c"
(demo_list <- setNames(demo_list, paste0("X", 1:3)))
$X1
[1] "a"
$X2
[1] "b"
$X3
[1] "c"
1.3.2 Batch-renaming columns with rename_with()
Scenario: given a multi-column data frame (see Figure @ref(fig:before-rename-iris)), we want to rename a batch of columns following a specified pattern (see Figure @ref(fig:after-rename-iris)).
Key point: use rename_with(., .fn = ~paste0(., to_app), .cols = all_of(cols)).
tbl_demo <- iris %>%
  select(Petal.Length:Species) %>%
  head(10)

DT::datatable(tbl_demo)

cols <- c("Petal.Length", "Petal.Width")
to_app <- "_xxx"

tbl_demo %>%
  rename_with(
    .fn = ~paste0(., to_app),
    .cols = all_of(cols)
  ) %>%
  DT::datatable()
References:
- Stack Overflow: How to rename selected columns using dplyr with new column names as strings (see link)
1.3.3 Keeping parent names as prefixes/suffixes under unnest()/nest()
Scenario: given a data frame holding list columns (see Figure @ref(fig:ls-dt)), we want to unnest it while preserving the naming relationship between the parent list columns and the child data frame columns; the desired naming is shown in Figure @ref(fig:ls-dt-unnest).
Key point: use unnest(., names_sep = "_").
library(dplyr, warn.conflicts = FALSE)

msd_c <- function(x) c(mn = mean(x), sd = sd(x))
msd_df <- function(x) bind_rows(c(mn = mean(x), sd = sd(x)))

tbl_demo <- iris %>%
  select(Petal.Length:Species) %>%
  group_by(Species) %>%
  tidyr::nest() %>%
  mutate(
    Petal.Length = purrr::map(data, ~ msd_df(.$Petal.Length)),
    Petal.Width = purrr::map(data, ~ msd_df(.$Petal.Width))
  ) %>%
  select(-data)

DT::datatable(tbl_demo)

tbl_unnest <- tbl_demo %>%
  tidyr::unnest(c(Petal.Length, Petal.Width), names_sep = "_")

DT::datatable(tbl_unnest) %>%
  formatRound(., columns = 2:5, digits = 4)
References:
- Stack Overflow: tidyr unnest, prefix column names with nested name during unnesting (see link)
1.3.4 Using prefixed/suffixed column names directly in pivot_longer()
Scenario: given a data frame with systematically named columns (shared prefixes or suffixes; see Figure @ref(fig:ls-dt-unnest) in the previous section), we want to pivot_longer it while exploiting those prefix/suffix relationships; the desired result is shown in Figure @ref(fig:dt-pivot-name-sep).
Key point: use pivot_longer(., names_to = c(".value", "stat"), names_sep = "_").
tbl_pivot <- tbl_unnest %>%
  pivot_longer(
    cols = starts_with('Petal'),
    names_to = c(".value", "stat"),
    names_sep = "_")

tbl_pivot %>%
  DT::datatable() %>%
  formatRound(columns = 3:4, digits = 4)
References:
- Stack Overflow: How to pivot_longer a set of multiple columns? and How to go back from that long format to original wide? (see link)
1.4 Data cleaning
1.4.1 How to generate repeated data correctly with rep()
Scenario: generate repeated data, given a vector of values to repeat ("char vec") and a vector of per-element repetition counts ("times vec").
- Candidate 1: plain rep().

brand <- c("果汁", "矿泉水", "绿茶", "其他", "碳酸饮料")
reps <- c(6L, 10L, 11L, 8L, 15L)

rep(brand, reps)
[1] "果汁" "果汁" "果汁" "果汁" "果汁" "果汁"
[7] "矿泉水" "矿泉水" "矿泉水" "矿泉水" "矿泉水" "矿泉水"
[13] "矿泉水" "矿泉水" "矿泉水" "矿泉水" "绿茶" "绿茶"
[19] "绿茶" "绿茶" "绿茶" "绿茶" "绿茶" "绿茶"
[25] "绿茶" "绿茶" "绿茶" "其他" "其他" "其他"
[31] "其他" "其他" "其他" "其他" "其他" "碳酸饮料"
[37] "碳酸饮料" "碳酸饮料" "碳酸饮料" "碳酸饮料" "碳酸饮料" "碳酸饮料"
[43] "碳酸饮料" "碳酸饮料" "碳酸饮料" "碳酸饮料" "碳酸饮料" "碳酸饮料"
[49] "碳酸饮料" "碳酸饮料"
- Candidate 2: mapply() + rep(). This one is a bit more involved.

drink <- mapply(FUN = rep, x = brand, times = reps) %>%
  unlist()
names(drink) <- NULL
drink
[1] "果汁" "果汁" "果汁" "果汁" "果汁" "果汁"
[7] "矿泉水" "矿泉水" "矿泉水" "矿泉水" "矿泉水" "矿泉水"
[13] "矿泉水" "矿泉水" "矿泉水" "矿泉水" "绿茶" "绿茶"
[19] "绿茶" "绿茶" "绿茶" "绿茶" "绿茶" "绿茶"
[25] "绿茶" "绿茶" "绿茶" "其他" "其他" "其他"
[31] "其他" "其他" "其他" "其他" "其他" "碳酸饮料"
[37] "碳酸饮料" "碳酸饮料" "碳酸饮料" "碳酸饮料" "碳酸饮料" "碳酸饮料"
[43] "碳酸饮料" "碳酸饮料" "碳酸饮料" "碳酸饮料" "碳酸饮料" "碳酸饮料"
[49] "碳酸饮料" "碳酸饮料"
1.4.2 How to check a sample dataset for missing values
- Problem: a regression lm(formula = lm.mod, data = lm.dt) raises a warning, indicating that the dataset lm.dt contains missing values:
12 observation deleted due to missingness
- Fix: use stats::complete.cases(lm.dt) to work with only the complete rows, or list the rows containing missing data with which(!stats::complete.cases(lm.dt)). A minimal sketch follows.
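The sketch below uses a toy data frame standing in for lm.dt:

# toy data with one NA in each column
lm.dt <- data.frame(y = c(1, 2, NA, 4),
                    x = c(1, NA, 3, 4))

# rows that contain at least one missing value
which(!stats::complete.cases(lm.dt))  # rows 2 and 3

# fit on the complete rows only
lm(y ~ x, data = lm.dt[stats::complete.cases(lm.dt), ])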
1.4.3 How to handle invisible Unicode characters
When cleaning text data we often have to deal with invisible Unicode characters such as tabs, line breaks, and CJK spaces. There are in fact far more of these invisible code points than one might expect (see the list below).
U+0009 CHARACTER TABULATION # tab character
U+0020 SPACE
U+00A0 NO-BREAK SPACE (not matched by \s) # the invisible space common in Chinese text
U+1680 OGHAM SPACE MARK
U+2000 EN QUAD
U+2001 EM QUAD
U+2002 EN SPACE
U+2003 EM SPACE
U+2004 THREE-PER-EM SPACE
U+2005 FOUR-PER-EM SPACE
U+2006 SIX-PER-EM SPACE
U+2007 FIGURE SPACE
U+2008 PUNCTUATION SPACE
U+2009 THIN SPACE
U+200A HAIR SPACE
U+202F NARROW NO-BREAK SPACE
U+205F MEDIUM MATHEMATICAL SPACE
U+3000 IDEOGRAPHIC SPACE
To handle them, we first need to identify which Unicode code points the invisible characters in a string actually are; the iconv() function performs the revealing conversion.

mystr <- "这是一个\u00a0不可见的中文空格符"
cat("这是一个\u00a0不可见的中文空格符")
这是一个 不可见的中文空格符
# not working
str_replace(mystr, " ","")
[1] "这是一个 不可见的中文空格符"
# detect
iconv(mystr,
      from = "UTF-8", to = "ASCII",
      sub = "Unicode") # ! important
[1] "<U+8FD9><U+662F><U+4E00><U+4E2A><U+00A0><U+4E0D><U+53EF><U+89C1><U+7684><U+4E2D><U+6587><U+7A7A><U+683C><U+7B26>"
# now it works
str_replace(mystr, "\u00a0","")
[1] "这是一个不可见的中文空格符"
1.4.4 Splitting strings
Scenario: split a string (or a text paragraph) while keeping the delimiters, which carry information of value for later analysis.
Key implementation points:
- Use the right flavor of regular expression. R offers two main flavors: the default extended regex and the Perl-compatible regex (perl = TRUE); there is also the literal pattern mode (fixed = TRUE).
- Use the base function base::strsplit().
The code example below splits a whole text paragraph on the dollar sign while keeping the delimiter ($).
# minimal example
txt <- "One way to interpret the CEF $m(x)=\\mathbb{E}[Y \\mid X=x]$ is in terms of how marginal changes in the regressors $X$ imply changes in the conditional expectation of the response variable $Y$."
txt
[1] "One way to interpret the CEF $m(x)=\\mathbb{E}[Y \\mid X=x]$ is in terms of how marginal changes in the regressors $X$ imply changes in the conditional expectation of the response variable $Y$."
The regular expression in the code below uses a lookahead assertion.
# split paragraph
split_raw <- strsplit(
  txt,
  split = "(?=[\\$])",
  perl = TRUE
) %>%
  unlist()

split_raw
[1] "One way to interpret the CEF "
[2] "$"
[3] "m(x)=\\mathbb{E}[Y \\mid X=x]"
[4] "$"
[5] " is in terms of how marginal changes in the regressors "
[6] "$"
[7] "X"
[8] "$"
[9] " imply changes in the conditional expectation of the response variable "
[10] "$"
[11] "Y"
[12] "$"
[13] "."
Note that stringr::str_split() does not reproduce this result:

stringr::str_split(string = txt,
                   pattern = "(?=[\\$])")
[[1]]
[1] "One way to interpret the CEF "
[2] "$m(x)=\\mathbb{E}[Y \\mid X=x]"
[3] "$ is in terms of how marginal changes in the regressors "
[4] "$X"
[5] "$ imply changes in the conditional expectation of the response variable "
[6] "$Y"
[7] "$."
Related resources:
1.4.5 Odd and even positions
Scenario: in text processing, math formulas appear between paired double dollar signs ($$math environment$$) or paired single dollar signs ($math inline$). Because the delimiters come in pairs, the odd-numbered occurrences mark the start and the even-numbered ones the end, so locating odd/even delimiter positions is an important string-processing task.
Key points:
- base::seq_len()
- the modulo operation %% 2
We continue with the text example from the previous section (see Section @ref(string-split), splitting strings).
# demo txt lines
split_raw
[1] "One way to interpret the CEF "
[2] "$"
[3] "m(x)=\\mathbb{E}[Y \\mid X=x]"
[4] "$"
[5] " is in terms of how marginal changes in the regressors "
[6] "$"
[7] "X"
[8] "$"
[9] " imply changes in the conditional expectation of the response variable "
[10] "$"
[11] "Y"
[12] "$"
[13] "."
# find the positions of the delimiter lines
(lines <- grep("\\$", split_raw))
[1] 2 4 6 8 10 12
# modulo division
(row_mod <- seq_len(length(lines)) %% 2)
[1] 1 0 1 0 1 0
# start row (indexing odd positions)
(lines_start <- lines[row_mod == 1])
[1] 2 6 10
# end row (indexing even positions)
(lines_end <- lines[row_mod == 0])
[1] 4 8 12
Of course, we can also write small helper functions of our own:

odd <- function(x) x %% 2 != 0
even <- function(x) x %% 2 == 0
evenb <- function(x) !odd(x)
# odd positions
odd(1:10)
[1] TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE
# even positions
even(1:10)
[1] FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE
References:
1.5 Data transformation
1.5.1 Rounding versus displaying decimals
Rules of thumb: (a) There is a trade-off between what the reader's eye sees and what the data truly are: rounding with round() simplifies the display but necessarily loses information. (b) In most cases we should use round() sparingly and instead control the displayed decimals with formatC().
require(janitor)
require(kableExtra)

Loading required package: kableExtra

Attaching package: 'kableExtra'

The following object is masked from 'package:dplyr':

    group_rows

df_test <- read_rds(here("data/test-round.rds"))
Below is a typical table output with a row total:

df_test %>%
  mutate(year = as.character(year)) %>%
  janitor::adorn_totals() %>%
  kable(align = "r",
        caption = "kable with row sum")
region_pro | year | vars | value |
---|---|---|---|
非旱区 | 2019 | 教育 | 19673.74 |
非旱区 | 2019 | 科学技术 | 4371.65 |
非旱区 | 2019 | 农林水 | 11655.07 |
Total | - | - | 35700.46 |
A sharp-eyed reader will notice that, reading the value column as whole numbers, the total 35700 does not equal the direct sum of the rows: 19674 + 4372 + 11655 = 35701. (From a publisher's copy editor's point of view, this is an error!)
The immediate suspicion is that the "true" data were rounded somewhere, producing this little inconsistency.
Let us display the decimal places of the "true" data:
df_test %>%
  mutate(year = as.character(year)) %>%
  janitor::adorn_totals() %>%
  mutate(value = formatC(value, format = 'f', digits = 2)) %>%
  kable(align = "r",
        caption = "kable with row sum by format")
region_pro | year | vars | value |
---|---|---|---|
非旱区 | 2019 | 教育 | 19673.74 |
非旱区 | 2019 | 科学技术 | 4371.65 |
非旱区 | 2019 | 农林水 | 11655.07 |
Total | - | - | 35700.46 |
Clearly, the totals now agree with common-sense arithmetic!
So, before formally publishing a report, how exactly should we proceed to present decimal results "properly"?
Display two decimal places directly:

# formatC with digits 2
formatC(df_test$value, format = "f", digits = 2)
[1] "19673.74" "4371.65" "11655.07"
Round to one decimal place, then display two decimals. (Note how the last two numbers change!)

# formatC after round function
formatC(round(df_test$value, 1), format = "f", digits = 2)
[1] "19673.70" "4371.60" "11655.10"
Round to two decimal places, sum, then display two decimals:

# formatC function
formatC(sum(round(df_test$value, 2)), format = "f", digits = 2)
[1] "35700.46"
Round to one decimal place, sum, then display two decimals:

# formatC function
formatC(sum(round(df_test$value, 1)), format = "f", digits = 2)
[1] "35700.40"
Round to integers, sum, then display two decimals:

# formatC function
formatC(sum(round(df_test$value, 0)), format = "f", digits = 2)
[1] "35701.00"
So, to satisfy sharp-eyed and picky editors or readers, we need to round first and then sum. This trick comes with an unavoidable side effect, though: over repeated rounds of rounding and summing, the gap between the reported totals and the "true" floating-point values keeps growing.

df_test %>%
  mutate(year = as.character(year)) %>%
  mutate(value = round(value)) %>%
  janitor::adorn_totals() %>%
  #mutate(value = formatC(value, format = 'f', digits = 2)) %>%
  kable(align = "r",
        caption = "kable seems much normal!")
region_pro | year | vars | value |
---|---|---|---|
非旱区 | 2019 | 教育 | 19674 |
非旱区 | 2019 | 科学技术 | 4372 |
非旱区 | 2019 | 农林水 | 11655 |
Total | - | - | 35701 |
1.5.2 Performing lag transformations correctly
Rule of thumb: when creating lags with lag(), mind the fill value, which defaults to default = NA. If the lagged variable feeds further calculations, the default NA propagates and has side effects, e.g. 4 - NA = NA.
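A quick sketch of the side effect and its cure:

# the default NA propagates into differences
x <- c(4, 7, 9)
x - dplyr::lag(x)               # NA  3  2
x - dplyr::lag(x, default = 0)  #  4  3  2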
## default origin specification
### origin = "1970-01-01"
start_dates <- lubridate::as_date(
  12274:12279,
  origin = lubridate::origin)

set.seed(123)
df_demo <- tibble(date = start_dates,
                  value = rnorm(n = length(start_dates), 5, 2),
                  cat = rep(c("A", "B"), each = 3))

kable(df_demo, caption = "模拟的数据")
date | value | cat |
---|---|---|
2003-08-10 | 3.879049 | A |
2003-08-11 | 4.539645 | A |
2003-08-12 | 8.117417 | A |
2003-08-13 | 5.141017 | B |
2003-08-14 | 5.258576 | B |
2003-08-15 | 8.430130 | B |
df_lag <- df_demo %>%
  arrange(cat, date) %>%
  #group_by(cat) %>%
  mutate(cat_lag1 = lag(cat),
         cat_lag2 = lag(cat, default = "xx")) %>%
  mutate(detect1 = ifelse(cat != cat_lag1, 1, 0),
         detect2 = ifelse(cat != cat_lag2, 1, 0))

df_lag %>%
  kable(caption = "lag函数默认缺失值的设定", digits = 2)
date | value | cat | cat_lag1 | cat_lag2 | detect1 | detect2 |
---|---|---|---|---|---|---|
2003-08-10 | 3.88 | A | NA | xx | NA | 1 |
2003-08-11 | 4.54 | A | A | A | 0 | 0 |
2003-08-12 | 8.12 | A | A | A | 0 | 0 |
2003-08-13 | 5.14 | B | A | A | 1 | 1 |
2003-08-14 | 5.26 | B | B | B | 0 | 0 |
2003-08-15 | 8.43 | B | B | B | 0 | 0 |
1.5.3 Constructing polynomial columns
stats::poly() can construct polynomial variables (of a specified degree p) in matrix form, but to use it inside a dplyr mutate() pipeline we need the rarely seen splice operator !!!:
use !!! to splice the columns (turning each element of a list/data.frame into its own argument).
data("mtcars")
<- mtcars %>%
df_testmutate(
!!!as.data.frame(
::poly(x = .$wt,
statsdegree = 3,
raw = TRUE)
)%>%
) rename_at(all_of(as.character(1:3)),
~all_of(paste0("wt_p",1:3)))
Warning: Using `all_of()` outside of a selecting function was deprecated in tidyselect
1.2.0.
ℹ See details at
<https://tidyselect.r-lib.org/reference/faq-selection-context.html>
df_test %>%
  select(starts_with("wt")) %>%
  DT::datatable(caption = "wt的多项式构造(p=3)",
                options = list(dom = "tip", pageLength = 10)) %>%
  formatRound(c(1:4), digits = 2)
References:
- dplyr: Using poly function to generate polynomial coefficients (see Stack Overflow)
1.6 Data calculations
1.6.1 Counting observations
Rule of thumb: when counting over multiple columns with dplyr::summarise() combined with across(), use sum(!is.na(.)) rather than n(); n() counts every row, including rows whose value is NA.
References:
- How to Use na.rm=TRUE with n() While Using Dplyr's Group_by and Summarise_at (Stack Overflow)
- Why I love dplyr's across (blog post)
df_fake <- mtcars %>%
  mutate(carb = ifelse(carb %in% c(3, 8), NA, carb))
df_fake$gear[c(3, 6, 9)] <- NA

kable(df_fake, caption = "fake data with NA")
mpg | cyl | disp | hp | drat | wt | qsec | vs | am | gear | carb | |
---|---|---|---|---|---|---|---|---|---|---|---|
Mazda RX4 | 21.0 | 6 | 160.0 | 110 | 3.90 | 2.620 | 16.46 | 0 | 1 | 4 | 4 |
Mazda RX4 Wag | 21.0 | 6 | 160.0 | 110 | 3.90 | 2.875 | 17.02 | 0 | 1 | 4 | 4 |
Datsun 710 | 22.8 | 4 | 108.0 | 93 | 3.85 | 2.320 | 18.61 | 1 | 1 | 1 | |
Hornet 4 Drive | 21.4 | 6 | 258.0 | 110 | 3.08 | 3.215 | 19.44 | 1 | 0 | 3 | 1 |
Hornet Sportabout | 18.7 | 8 | 360.0 | 175 | 3.15 | 3.440 | 17.02 | 0 | 0 | 3 | 2 |
Valiant | 18.1 | 6 | 225.0 | 105 | 2.76 | 3.460 | 20.22 | 1 | 0 | 1 | |
Duster 360 | 14.3 | 8 | 360.0 | 245 | 3.21 | 3.570 | 15.84 | 0 | 0 | 3 | 4 |
Merc 240D | 24.4 | 4 | 146.7 | 62 | 3.69 | 3.190 | 20.00 | 1 | 0 | 4 | 2 |
Merc 230 | 22.8 | 4 | 140.8 | 95 | 3.92 | 3.150 | 22.90 | 1 | 0 | 2 | |
Merc 280 | 19.2 | 6 | 167.6 | 123 | 3.92 | 3.440 | 18.30 | 1 | 0 | 4 | 4 |
Merc 280C | 17.8 | 6 | 167.6 | 123 | 3.92 | 3.440 | 18.90 | 1 | 0 | 4 | 4 |
Merc 450SE | 16.4 | 8 | 275.8 | 180 | 3.07 | 4.070 | 17.40 | 0 | 0 | 3 | |
Merc 450SL | 17.3 | 8 | 275.8 | 180 | 3.07 | 3.730 | 17.60 | 0 | 0 | 3 | |
Merc 450SLC | 15.2 | 8 | 275.8 | 180 | 3.07 | 3.780 | 18.00 | 0 | 0 | 3 | |
Cadillac Fleetwood | 10.4 | 8 | 472.0 | 205 | 2.93 | 5.250 | 17.98 | 0 | 0 | 3 | 4 |
Lincoln Continental | 10.4 | 8 | 460.0 | 215 | 3.00 | 5.424 | 17.82 | 0 | 0 | 3 | 4 |
Chrysler Imperial | 14.7 | 8 | 440.0 | 230 | 3.23 | 5.345 | 17.42 | 0 | 0 | 3 | 4 |
Fiat 128 | 32.4 | 4 | 78.7 | 66 | 4.08 | 2.200 | 19.47 | 1 | 1 | 4 | 1 |
Honda Civic | 30.4 | 4 | 75.7 | 52 | 4.93 | 1.615 | 18.52 | 1 | 1 | 4 | 2 |
Toyota Corolla | 33.9 | 4 | 71.1 | 65 | 4.22 | 1.835 | 19.90 | 1 | 1 | 4 | 1 |
Toyota Corona | 21.5 | 4 | 120.1 | 97 | 3.70 | 2.465 | 20.01 | 1 | 0 | 3 | 1 |
Dodge Challenger | 15.5 | 8 | 318.0 | 150 | 2.76 | 3.520 | 16.87 | 0 | 0 | 3 | 2 |
AMC Javelin | 15.2 | 8 | 304.0 | 150 | 3.15 | 3.435 | 17.30 | 0 | 0 | 3 | 2 |
Camaro Z28 | 13.3 | 8 | 350.0 | 245 | 3.73 | 3.840 | 15.41 | 0 | 0 | 3 | 4 |
Pontiac Firebird | 19.2 | 8 | 400.0 | 175 | 3.08 | 3.845 | 17.05 | 0 | 0 | 3 | 2 |
Fiat X1-9 | 27.3 | 4 | 79.0 | 66 | 4.08 | 1.935 | 18.90 | 1 | 1 | 4 | 1 |
Porsche 914-2 | 26.0 | 4 | 120.3 | 91 | 4.43 | 2.140 | 16.70 | 0 | 1 | 5 | 2 |
Lotus Europa | 30.4 | 4 | 95.1 | 113 | 3.77 | 1.513 | 16.90 | 1 | 1 | 5 | 2 |
Ford Pantera L | 15.8 | 8 | 351.0 | 264 | 4.22 | 3.170 | 14.50 | 0 | 1 | 5 | 4 |
Ferrari Dino | 19.7 | 6 | 145.0 | 175 | 3.62 | 2.770 | 15.50 | 0 | 1 | 5 | 6 |
Maserati Bora | 15.0 | 8 | 301.0 | 335 | 3.54 | 3.570 | 14.60 | 0 | 1 | 5 | |
Volvo 142E | 21.4 | 4 | 121.0 | 109 | 4.11 | 2.780 | 18.60 | 1 | 1 | 4 | 2 |
df_fake %>%
  group_by(carb) %>%
  dplyr::summarise(n_carb = n()) %>%
  kable(caption = "summarise with `n()`")
carb | n_carb |
---|---|
1 | 7 |
2 | 10 |
4 | 10 |
6 | 1 |
NA | 4 |
# Helper function
summarizer <- function(data, numeric_cols = NULL, ...) {
  data %>%
    group_by(...) %>%
    dplyr::summarise(
      across({{numeric_cols}},
             list(
               count = ~sum(!is.na(.))  # ! do not use `n()`
             ),
             .names = "{col}_{fn}"))
}
df_fake %>%
  summarizer(.,
             numeric_cols = all_of(c("gear", "cyl")),
             carb) %>%
  kable(caption = "summarise with `sum()`")
carb | gear_count | cyl_count |
---|---|---|
1 | 5 | 7 |
2 | 9 | 10 |
4 | 10 | 10 |
6 | 1 | 1 |
NA | 4 | 4 |
options(knitr.kable.NA = 'NA')
1.6.2 Descriptive statistics
Rule of thumb: when running descriptive statistics over several specified columns, dplyr::summarise() combined with across() greatly improves efficiency.
References:
- Why I love dplyr's across (blog post)
- Chapter 40, "tidyverse中的across()之美" (bookdown)
# Helper function
summarizer <- function(data, numeric_cols = NULL, ...) {
  data %>%
    group_by(...) %>%
    dplyr::summarise(
      across({{numeric_cols}},
             list(
               count = ~sum(!is.na(.)),  # ! do not use `n()`
               mean = ~mean(.x, na.rm = TRUE),
               sd = ~sd(.x, na.rm = TRUE),
               min = ~min(.x, na.rm = TRUE),
               max = ~max(.x, na.rm = TRUE)
             ), .names = "{col}_{fn}"))
}

var_quanty <- names(df_fake)

smry_eda <- df_fake %>%
  summarizer(., numeric_cols = all_of(var_quanty)) %>%
  pivot_longer(., contains('_'),
               names_to = "variables",
               values_to = "value") %>%
  separate(., col = variables,
           into = c("var", "measure"),
           sep = "_") %>%
  pivot_wider(., names_from = measure, values_from = value) %>%
  select(var, count, mean, sd, min, max)

kable(smry_eda,
      caption = "描述性统计表",
      align = "c")
var | count | mean | sd | min | max |
---|---|---|---|---|---|
mpg | 32 | 20.090625 | 6.0269481 | 10.400 | 33.900 |
cyl | 32 | 6.187500 | 1.7859216 | 4.000 | 8.000 |
disp | 32 | 230.721875 | 123.9386938 | 71.100 | 472.000 |
hp | 32 | 146.687500 | 68.5628685 | 52.000 | 335.000 |
drat | 32 | 3.596563 | 0.5346787 | 2.760 | 4.930 |
wt | 32 | 3.217250 | 0.9784574 | 1.513 | 5.424 |
qsec | 32 | 17.848750 | 1.7869432 | 14.500 | 22.900 |
vs | 32 | 0.437500 | 0.5040161 | 0.000 | 1.000 |
am | 32 | 0.406250 | 0.4989909 | 0.000 | 1.000 |
gear | 29 | 3.689655 | 0.7608007 | 3.000 | 5.000 |
carb | 28 | 2.607143 | 1.3968028 | 1.000 | 6.000 |
1.7 Custom functions
1.7.1 Strings as variable names
When developing custom functions, we may need to define variable names dynamically, i.e., turn a given string into a variable name and then apply it in a tidyverse mutate() operation.
Several approaches to dynamic variable naming follow:
df <- tibble(
  province = c("陕西", "山东", "湖南"),
  value = rnorm(3),
  cat = LETTERS[1:3]
)

kable(df, align = "c", caption = "演示数据")
province | value | cat |
---|---|---|
陕西 | 0.4609162 | A |
山东 | -1.2650612 | B |
湖南 | -0.6868529 | C |
Method 1: with dplyr (version > 1.0, which incorporates glue-style argument interpolation), combine {{}}, := and get(). Drawback: the function code becomes hard to read! (See Stack Overflow.)
<- "province"
col_tar <- paste0(col_tar,"_raw")
col_media <-3
n<- "value"
col_num # this will work
<- df %>%
out1 # duplicate new named columns
mutate({{col_media}} := get(col_tar)) %>%
mutate("{col_media}.{3}" := value*n) %>%
mutate("mean of {col_num}" := mean(get(col_num)))
# NOTE: this will work in another way unintended
<- df %>%
out2 # duplicate new named columns
mutate({{col_media}} := {{col_tar}})
kable(out1, align = "c", caption = "可行1:{{}}结合:=以及get()")
province | value | cat | province_raw | province_raw.3 | mean of value |
---|---|---|---|---|---|
陕西 | 0.4609162 | A | 陕西 | 1.382749 | -0.4969993 |
山东 | -1.2650612 | B | 山东 | -3.795184 | -0.4969993 |
湖南 | -0.6868529 | C | 湖南 | -2.060559 | -0.4969993 |
kable(out2, align = "c", caption = "不可行2:{{}}结合:=")
province | value | cat | province_raw |
---|---|---|---|
陕西 | 0.4609162 | A | province |
山东 | -1.2650612 | B | province |
湖南 | -0.6868529 | C | province |
Method 2: combine !!rlang::sym() with :=. Drawback: the code is rather verbose.

# this will work
out3 <- df %>%
  # duplicate new named columns
  mutate(!!rlang::sym(col_media) := !!rlang::sym(col_tar))

kable(out3, align = "c", caption = "可行3:结合!!rlang::sym()以及:=")
province | value | cat | province_raw |
---|---|---|---|
陕西 | 0.4609162 | A | 陕西 |
山东 | -1.2650612 | B | 山东 |
湖南 | -0.6868529 | C | 湖南 |
1.7.2 A counter function

task_counter <- function() {
  i <- 0
  function() {
    i <<- i + 1
    i
  }
}
n_task <- task_counter()

# first call
n_task()
[1] 1
# second call
n_task()
[1] 2
# reset the counter
n_task <- task_counter()
# first call again
n_task()
[1] 1
References:
- Hadley Wickham's "Advanced R", the section on mutable state
- R-bloggers post: A Little R Counter
1.7.3 Permutations and combinations
When counting selections, distinguish the with-repetition and without-repetition cases:
- The number of ordered selections with repetition (k draws from n values) is \(n^k\).
- The number of combinations without repetition is \(\binom{n}{k}=\frac{n!}{k!(n-k)!}\).
For permutation and combination computations in R, see: Learning R: Permutations and Combinations with base R.
# combination function
## combinations without repetitions
n1 <- length(combn(1:151, 2))/2
n2 <- choose(151, 2)

# permutation function
perm <- function(v) {
  n <- length(v)
  if (n == 1) v
  else {
    X <- NULL
    for (i in 1:n) X <- rbind(X, cbind(v[i], perm(v[-i])))
    X
  }
}
# nrow(perm(1:2))
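For the with-repetition count \(n^k\), a short sketch:

# ordered selections with repetition: n^k
n <- 3; k <- 2
n^k                          # 9
nrow(expand.grid(1:n, 1:n))  # enumerating all ordered pairs also gives 9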
1.8 The janitor package
1.8.1 Tabulation vs janitor::tabyl
tabyls: a tidy, fully-featured approach to counting things. see link
(1) Strengths:
- convenient frequency and proportion counts
- convenient row totals
- convenient format adjustments (percentages, etc.)
- combines with knitr::kable() for output
(2) Limitations:
- janitor::tabyl() only handles raw data, and only as a data.frame or vector.
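A minimal sketch of this workflow, using the built-in mtcars data:

library(janitor)

mtcars %>%
  tabyl(cyl) %>%                        # counts and proportions
  adorn_totals("row") %>%               # add a Total row
  adorn_pct_formatting(digits = 1) %>%  # display proportions as percentages
  knitr::kable()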
1.8.2 Mind the order of the function calls
Pay particular attention to the order in which the janitor::adorn_*() functions are applied. The points to watch:
- Compute totals first, then set formats (see the sketch after this list). The crux is that changing a column's display format changes its type, e.g. from numeric to character. So add row totals with adorn_totals("row") before adorn_pct_formatting() converts proportions to percentages.
- Columns that were not totaled by adorn_totals() will not be reformatted by adorn_pct_formatting() either. In particular, a manually computed proportion column (a decimal such as 0.32) cannot be turned into a percentage (such as 32%) by adorn_pct_formatting().
- After adorn_totals() adds totals, the result is no longer a tidy data table, so columns or rows that should not be totaled need extra handling. Cumulative percentages and cumulative counts, for example, should not receive a Total value.
1.8.3 Totaling or formatting only specified columns
First, totaling or formatting specified columns must respect the call order described above.
Second, it requires writing out all of the janitor::adorn_*() arguments in full, which is quite unfriendly.
Finally, it relies on dplyr select semantics for choosing columns, e.g. all_of(contains("p")).
<-c("非常不满意","不满意","一般","满意","非常满意")
char <- c(24L, 108L,93L, 45L,30L)
reps
<- rep(char, reps)
sat_list
<- data.frame(groups=LETTERS[1:5],
sat satisfication = sat_list,
row.names=NULL) %>%
mutate(satisfication = factor(satisfication, levels = char))
<- sat %>%
cum_calc ::tabyl(satisfication) %>%
janitormutate(min_cum_n = cumsum(n),
min_cum_p = cumsum(percent), #<<
max_cum_n =rev( cumsum(rev(n))), #<<
max_cum_p =rev( cumsum(rev(percent))))
<- cum_calc %>%
cum_formatmutate(min_cum_p = scales::percent(min_cum_p,accuracy = 0.1), #<<
max_cum_p = scales::percent(max_cum_p,accuracy = 0.1)) %>%
::adorn_totals(.,where = "row",fill = "-",na.rm = TRUE, #<<
janitorname = "Total", all_of(c("n","percent"))) %>%
::adorn_pct_formatting(.,digits = 1, #<<
janitorrounding ="half to even",
affix_sign = TRUE,
all_of(c("percent"))) #<<
1.8.4 Correlation and regression coefficients
require(techme)
Loading required package: techme
data("AgriFertilizer", package = "techme")
names(AgriFertilizer)
[1] "province" "year" "chn_block4" "value" "units"
[6] "variables"
df <- AgriFertilizer %>%
  filter(year == 2019) %>%
  pivot_wider(id_cols = province, names_from = variables, values_from = value)

lm(data = df, v7_sctj_nyhf_df ~ 0 + v7_sctj_nyhf_lf)
Call:
lm(formula = v7_sctj_nyhf_df ~ 0 + v7_sctj_nyhf_lf, data = df)
Coefficients:
v7_sctj_nyhf_lf
2.811
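Since this section's heading also mentions the correlation coefficient, the matching correlation for the same two columns can be computed with cor(); a sketch:

# correlation between the two variables used in the regression above
with(df, cor(v7_sctj_nyhf_df, v7_sctj_nyhf_lf, use = "complete.obs"))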