go语言csv包_玩转数据处理120题R语言版本
點(diǎn)擊上方“早起Python”,關(guān)注并星標(biāo)公眾號(hào)
和我一起玩Python
本文為玩轉(zhuǎn)數(shù)據(jù)處理120題|R語(yǔ)言版本
習(xí)題|劉早起,解答|陳熹
大家好,本文為R語(yǔ)言數(shù)據(jù)處理120題系列完整版本。作者精心挑選120道數(shù)據(jù)處理中相關(guān)操作以習(xí)題形式發(fā)布,一共涵蓋了數(shù)據(jù)處理、計(jì)算、可視化等常用操作,并對(duì)部分題目給出了多種解法與注解。動(dòng)手敲一遍代碼一定會(huì)讓你有所收獲!
1創(chuàng)建DataFrame題目:將下面的字典創(chuàng)建為DataFramedata = {"grammer":["Python","C","Java","GO",np.nan,"SQL","PHP","Python"],???????"score":[1,2,np.nan,4,5,6,7,10]}難度:?期望結(jié)果R語(yǔ)言解法# R中沒(méi)有字典概念,故直接創(chuàng)建dataframe/tibble
#> 第一種
df???"grammer"?= c("Python","C","Java","GO",NA,"SQL","PHP","Python"),
??"score"?= c(1,2,NA,4,5,6,7,10)
)
#> 第二種
library(tibble)
df ??"grammer"?= c("Python","C","Java","GO",NA,"SQL","PHP","Python"),
??"score"?= c(1,2,NA,4,5,6,7,10)
)
# 也可以用tribble橫向建tibble注:1-20題均基于該數(shù)據(jù)框給出?2數(shù)據(jù)提取題目:提取含有字符串"Python"的行難度:??期望結(jié)果grammer??score
0?Python????1.0
7?Python???10.0
R語(yǔ)言解法
df[which(df$grammer?== 'Python'),]3提取列名題目:輸出df的所有列名難度:?期望結(jié)果Index(['grammer',?'score'], dtype='object')R語(yǔ)言解法
names(df)# [1] "grammer" "score"4修改列名題目:修改第二列列名為'popularity'難度:??R語(yǔ)言解法df %
??rename(popularity = score)5字符統(tǒng)計(jì)題目:統(tǒng)計(jì)grammer列中每種編程語(yǔ)言出現(xiàn)的次數(shù)難度:??R語(yǔ)言解法# 神方法table
table(df$grammer)6缺失值處理題目:將空值用上下值的平均值填充難度:???上下兩數(shù)的均值
df['popularity'] = df['popularity'].fillna(df['popularity'].interpolate())
R語(yǔ)言解法
library(Hmisc)index?is.na(df$popularity))
df$popularity ???????????????????????(unlist(df[index-1, 2] +
???????????????????????????????df[index+1, 2]))/2)7數(shù)據(jù)提取題目:提取popularity列中值大于3的行難度:??R語(yǔ)言解法df %>%
??filter(popularity > 3)
# 等價(jià)于
df[df$popularity > 3,] # 這種方法跟pandas很相似8數(shù)據(jù)去重題目:按照grammer列進(jìn)行去重難度:??R語(yǔ)言解法df[!duplicated(df$grammer),]9數(shù)據(jù)計(jì)算題目:計(jì)算popularity列平均值難度:??R語(yǔ)言解法#> 第一種
mean(df$popularity)
# [1] 4.75
#> 第二種
df %>%
??summarise(mean = mean(popularity))
## A tibble: 1 x 1
# mean
#
# 1 4.7510格式轉(zhuǎn)換題目:將grammer列轉(zhuǎn)換為list難度:??R解法unlist(df$grammer)
# [1] "Python" "C" "Java" "GO" NA "SQL" "PHP" "Python"11數(shù)據(jù)保存題目:將DataFrame保存為EXCEL難度:??R解法#R對(duì)EXCEL文件不太友好
#第一種方法:利用readr包轉(zhuǎn)為csv再用EXCEL打開(kāi)
#文件本質(zhì)依然是csv
library(readr)
write_excel_csv(df,'filename.csv')
#第二種方法:利用openxlsx包
openxlsx::write.xlsx(df,'filename.xlsx')
#也可以用xlsx包,但需要先配置JAVA環(huán)境
#確保JAVA配置到環(huán)境變量中并命名為JAVA_HOME
Sys.getenv("JAVA_HOME")
install.packages('rJava')
install.packages("xlsxjars")
library(rJava)
library(xlsxjars)
xlsx::write.xlsx(df,'filename.xlsx')12數(shù)據(jù)查看題目:查看數(shù)據(jù)行列數(shù)難度:?R解法dim(df)
# [1] 8 213數(shù)據(jù)提取題目:提取popularity列值大于3小于7的行難度:??R解法library(dplyr)
df %>%
??filter(popularity > 3?& popularity <7)
# 等價(jià)于
df[(df$popularity > 3) & (df$popularity <7),]14位置處理題目:交換兩列位置難度:???R解法df?%
????select(popularity,everything())15數(shù)據(jù)提取題目:提取popularity列最大值所在行難度:??R解法df %>%
??filter(popularity == max(popularity))
# 同理也有類似pandas的方法
df[df$popularity == max(df$popularity),]16數(shù)據(jù)查看題目:查看最后5行數(shù)據(jù)難度:?R解法# R中head和tail默認(rèn)是6行,可以指定數(shù)字
tail(df,5)17數(shù)據(jù)修改題目:刪除最后一行數(shù)據(jù)難度:?R解法df[-dim(df)[1],]
# 等價(jià)于
df %>%
??filter(rownames(df) != max(rownames(df)))18數(shù)據(jù)修改題目:添加一行數(shù)據(jù)['Perl',6.6]難度:??R解法row 6.6,'Perl') # 需要和列的位置對(duì)應(yīng)
# 或者建數(shù)據(jù)框
row ??"grammer"?= c("Perl"),
??"popularity"?= c(6.6)
)
df 19數(shù)據(jù)整理題目:對(duì)數(shù)據(jù)按照"popularity"列值的大小進(jìn)行排序難度:??R解法df?%
??arrange(popularity)20字符統(tǒng)計(jì)題目:統(tǒng)計(jì)grammer列每個(gè)字符串的長(zhǎng)度難度:???R解法library(Hmisc)
library(stringr)
df$grammer $grammer,'R')
str_length(df$grammer)
df$len_str $grammer)
第二期:數(shù)據(jù)處理基礎(chǔ)
21數(shù)據(jù)讀取題目:讀取本地EXCEL數(shù)據(jù)難度:?R解法#R語(yǔ)言處理excel不友好,直接讀取日期時(shí)間數(shù)據(jù)會(huì)變成實(shí)數(shù)#openxlsx::read.xlsx中的detectDates參數(shù)只能識(shí)別純?nèi)掌?br />#as.Data轉(zhuǎn)換該列后時(shí)間數(shù)據(jù)丟失,只有日期
#故先把excel文件轉(zhuǎn)存為csv后用readr包讀取
# 該方法不理想
library(openxlsx)
df 'pandas120.xlsx',detectDates = T)
df$createTime as.Date(df$createTime,origin="1900-01-01")
# 轉(zhuǎn)存csv后再讀
library(readr)
df 'pandas120.csv')21—50部分習(xí)題與該數(shù)據(jù)相關(guān)22數(shù)據(jù)查看題目:查看df數(shù)據(jù)前5行難度:?期望輸出R解法# 默認(rèn)是6行,可指定行數(shù)
head(df,5)23數(shù)據(jù)計(jì)算題目:將salary列數(shù)據(jù)轉(zhuǎn)換為最大值與最小值的平均值難度:????期望輸出R解法library(stringr)
df$salary %
??str_replace_all('k','') %>%
??str_split('-',simplify = T) %>%
??apply(2,as.numeric) %>%
??rowMeans() * 100024數(shù)據(jù)分組題目:將數(shù)據(jù)根據(jù)學(xué)歷進(jìn)行分組并計(jì)算平均薪資難度:???期望輸出education?salary?? ? ? ? ? ?
不限 19600.000000
大專 10000.000000
本科 19361.344538
碩士 20642.857143R解法df?%>%
??group_by(education) %>%
??summarise(mean = mean(salary))25時(shí)間轉(zhuǎn)換題目:將createTime列時(shí)間轉(zhuǎn)換為月-日難度:???期望輸出R解法#轉(zhuǎn)化后該列屬性是 字符串,R中對(duì)時(shí)間格式要求嚴(yán)格
df$createTime as.Date(df$createTime) %>%
??str_replace('2020-','')26數(shù)據(jù)查看題目:查看索引、數(shù)據(jù)類型和內(nèi)存信息難度:?期望輸出<class?'pandas.core.frame.DataFrame'>
RangeIndex:?135?entries,?0?to?134
Data columns (total?4?columns):
createTime?135?non-null?object
education?135?non-null?object
salary?135?non-null?int64
categories?135?non-null?category
dtypes: category(1), int64(1), object(2)
memory usage:?3.5+ KBR解法str(df)
# 內(nèi)存查看需要用到其他的庫(kù)
library(pryr)
object_size(df)
# 6.66 kB27數(shù)據(jù)查看題目:查看數(shù)值型列的匯總統(tǒng)計(jì)難度:?R解法summary(df)28數(shù)據(jù)整理題目:新增一列根據(jù)salary將數(shù)據(jù)分為三組難度:????輸入期望輸出R解法#用ifelse也可以
#底層原理有差別但實(shí)現(xiàn)結(jié)果一樣
df %
??mutate(categories = case_when(
????salary >= 0?& salary < 5000?~ '低',
????salary >= 5000?& salary < 20000?~ '低',
????TRUE ~ '高'
??))29數(shù)據(jù)整理題目:按照salary列對(duì)數(shù)據(jù)降序排列難度:??R解法df?%>%
??arrange(desc(salary))30數(shù)據(jù)提取題目:取出第33行數(shù)據(jù)難度:??R解法df[33,]31數(shù)據(jù)計(jì)算題目:計(jì)算salary列的中位數(shù)難度:??R解法median(df$salary)
# [1] 1750032數(shù)據(jù)可視化題目:繪制薪資水平頻率分布直方圖難度:???期望輸出R解法library(ggplot2)
library(patchwork)
df?%>%
??ggplot(aes(salary)) +
??geom_histogram() +
??df?%>%
??ggplot(aes(salary)) +
??geom_histogram(bins = 10) # 這個(gè)跟python的bins一致33數(shù)據(jù)可視化題目:繪制薪資水平密度曲線難度:???期望輸出R解法df?%>%
??ggplot(aes(salary)) +
??geom_density() +
??xlim(c(0,70000))34數(shù)據(jù)刪除題目:刪除最后一列categories難度:?R解法df?4]
# 提高可讀性可采用如下代碼
df %
??select(-c('categories'))35數(shù)據(jù)處理題目:將df的第一列與第二列合并為新的一列難度:??R解法df %
??mutate(test?= paste0(df$education,df$createTime))36數(shù)據(jù)處理題目:將education列與salary列合并為新的一列難度:???備注:salary為int類型,操作與35題有所不同R解法df?%
??mutate(test1 =
???????????paste0(df$salary,df$education))37數(shù)據(jù)計(jì)算題目:計(jì)算salary最大值與最小值之差難度:???R解法df %>%
??summarise(delta = max(salary) - min(salary)) %>%
??unlist()
# delta
# 4150038數(shù)據(jù)處理題目:將第一行與最后一行拼接難度:??R解法rbind(df[1,],df[dim(df)[1],])39數(shù)據(jù)處理題目:將第8行數(shù)據(jù)添加至末尾難度:??R解法rbind(df,df[8,])40數(shù)據(jù)查看題目:查看每列的數(shù)據(jù)類型難度:?期望結(jié)果createTime?object
education?object
salary int64
test?object
test1?object
dtype:?objectR解法str(df)
# tibble [135 x 5] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
# $ createTime: chr [1:135] "03-16" "03-16" "03-16" "03-16" ...
# $ education : chr [1:135] "本科" "本科" "不限" "本科" ...
# $ salary : num [1:135] 27500 30000 27500 16500 15000 14000 23000 12500 7000 16000 ...
# $ test : chr [1:135] "本科03-16" "本科03-16" "不限03-16" "本科03-16" ...
# $ test1 : chr [1:135] "27500本科" "30000本科" "27500不限" "16500本科" ...41數(shù)據(jù)處理題目:將createTime列設(shè)置為索引難度:??R解法df %>%
??tibble::column_to_rownames('createTime')42數(shù)據(jù)創(chuàng)建題目:生成一個(gè)和df長(zhǎng)度相同的隨機(jī)數(shù)dataframe難度:??R解法df1 135,function(n) {
??replicate(n,sample(1:10,1))
})
# 列名暫時(shí)不一樣,下一題重命名43數(shù)據(jù)處理題目:將上一題生成的dataframe與df合并難度:??R解法df %
??rename(`0` = df1)
# 非常規(guī)命名需要用``包裹變量名44數(shù)據(jù)計(jì)算題目:生成新的一列new為salary列減去之前生成隨機(jī)數(shù)列難度:??R解法df %
??mutate(new?= salary - `0`)45缺失值處理題目:檢查數(shù)據(jù)中是否含有任何缺失值難度:???R解法# 這個(gè)包的結(jié)果呈現(xiàn)非常有趣
library(mice)
md.pattern(df)46數(shù)據(jù)轉(zhuǎn)換題目:將salary列類型轉(zhuǎn)換為浮點(diǎn)數(shù)難度:???R解法as.double(df2$salary)47數(shù)據(jù)計(jì)算題目:計(jì)算salary大于10000的次數(shù)難度:??R解法df?%>%
??filter(salary > 10000) %>%
??dim(.) %>%
??.[1]
48數(shù)據(jù)統(tǒng)計(jì)題目:查看每種學(xué)歷出現(xiàn)的次數(shù)難度:???期望輸出本科?119
碩士?7
不限?5
大專?4
Name: education, dtype:?int64R解法table(df$education)49數(shù)據(jù)查看題目:查看education列共有幾種學(xué)歷難度:??R解法length(unique(df$education))
# [1] 450數(shù)據(jù)提取題目:提取salary與new列的和大于60000的最后3行難度:????期望輸出R解法df[df$salary + df$new?> 60000,] %>%
??.[nrow(.)-3+1:nrow(.),] %>%
??na.omit(.)51數(shù)據(jù)讀取題目:使用絕對(duì)路徑讀取本地Excel數(shù)據(jù)難度:?R解法# 轉(zhuǎn)存csv后再讀
library(readr)
df 'C:/Users/chenx/Documents/Data Analysis/Pandas51-80.csv')
備注
請(qǐng)將答案中路徑替換為自己機(jī)器存儲(chǔ)數(shù)據(jù)的絕對(duì)路徑,51—80相關(guān)習(xí)題與該數(shù)據(jù)有關(guān)
52數(shù)據(jù)查看題目:查看數(shù)據(jù)前三行難度:?期望結(jié)果R解法head(df,3)53缺失值處理題目:查看每列數(shù)據(jù)缺失值情況難度:??期望結(jié)果代碼?1簡(jiǎn)稱?2
日期?2
前收盤價(jià)(元)?2
開(kāi)盤價(jià)(元)?2
最高價(jià)(元)?2
最低價(jià)(元)?2
收盤價(jià)(元)?2
成交量(股)?2
成交金額(元)?2
.................R解法colSums(is.na(df))54缺失值處理題目:提取日期列含有空值的行難度:??期望結(jié)果R解法df[is.na(df$日期),]55缺失值處理題目:輸出每列缺失值具體行數(shù)難度:???期望結(jié)果列名:"代碼", 第[327]行位置有缺失值
列名:"簡(jiǎn)稱", 第[327, 328]行位置有缺失值
列名:"日期", 第[327, 328]行位置有缺失值
列名:"前收盤價(jià)(元)", 第[327, 328]行位置有缺失值
列名:"開(kāi)盤價(jià)(元)", 第[327, 328]行位置有缺失值
列名:"最高價(jià)(元)", 第[327, 328]行位置有缺失值
列名:"最低價(jià)(元)", 第[327, 328]行位置有缺失值
列名:"收盤價(jià)(元)", 第[327, 328]行位置有缺失值
................R解法library(glue)
for?(i in?names(df)){
??if(sum(is.na(df[,'日期'])) != 0){
????res1 is.na(df[,i]))
????res2 ',')
????print(glue('列名:"{i}", 第[{res2}]行有缺失值'))
??}
}56缺失值處理題目:刪除所有存在缺失值的行難度:??Python解法df.dropna(axis=0, how='any', inplace=True)
備注
axis:0-行操作(默認(rèn)),1-列操作how:any-只要有空值就刪除(默認(rèn)),all-全部為空值才刪除
inplace:False-返回新的數(shù)據(jù)集(默認(rèn)),True-在原數(shù)據(jù)集上操作57數(shù)據(jù)可視化題目:繪制收盤價(jià)的折線圖難度:??期望結(jié)果R解法library(ggplot2)
df?%>%
??ggplot(aes(日期,`收盤價(jià)(元)`)) +
??geom_line()58數(shù)據(jù)可視化題目:同時(shí)繪制開(kāi)盤價(jià)與收盤價(jià)難度:???期望結(jié)果R解法df %>%
??ggplot() +
??geom_line(aes(日期,`收盤價(jià)(元)`), size=1.2, color='steelblue') +
??geom_line(aes(日期,`開(kāi)盤價(jià)(元)`), size=1.2, color='orange') +
??ylab(c('價(jià)格(元)'))
# 這種畫出來(lái)沒(méi)有圖例,當(dāng)然可以手動(dòng)添加,但為了映射方便可以用另一種方法
library(tidyr)
df %>%
??select(日期,`開(kāi)盤價(jià)(元)`,`收盤價(jià)(元)`) %>%
??pivot_longer(c(`開(kāi)盤價(jià)(元)`,`收盤價(jià)(元)`),
???????????????names_to='type',values_to='price') %>%
??ggplot(aes(日期,price,color=type)) +
??geom_line(size=1.2) +
??scale_color_manual(values=c('steelblue','orange')) +
??theme_bw() +
??theme(
????panel.grid.major = element_blank(),
????panel.grid.minor = element_blank(),
????legend.title = element_blank(),
????legend.position = c(0.86, 0.9)
??)59數(shù)據(jù)可視化題目:繪制漲跌幅的直方圖難度:??期望結(jié)果R解法df?%>%
??ggplot(aes(`漲跌幅(%)`)) +
??geom_histogram()
# 可以指定bins60數(shù)據(jù)可視化題目:讓直方圖更細(xì)致難度:??期望結(jié)果R解法df?%>%
??ggplot(aes(`漲跌幅(%)`)) +
??geom_histogram(bins=30)61數(shù)據(jù)創(chuàng)建題目:以data的列名創(chuàng)建一個(gè)dataframe難度:??R解法temp?62異常值處理題目:打印所有換手率不是數(shù)字的行難度:???期望結(jié)果R解法#換手率這一列屬性為chr,需要先強(qiáng)轉(zhuǎn)數(shù)值型
#如果轉(zhuǎn)換失敗會(huì)變成NA,判斷即可
df[is.na(as.numeric(df$`換手率(%)`)),]63異常值處理題目:打印所有換手率為--的行難度:???R解法df?%>%
??filter(`換手率(%)`?== '--')
備注
通過(guò)上一題我們發(fā)現(xiàn)換手率的異常值只有--
64數(shù)據(jù)處理題目:重置data的行號(hào)難度:?R解法rownames(df)# 如果是tibble則索引始終是按順序
備注
有時(shí)我們修改數(shù)據(jù)會(huì)導(dǎo)致索引混亂
65異常值處理題目:刪除所有換手率為非數(shù)字的行難度:???R解法df[!is.na(as.numeric(df$`換手率(%)`)),]# 或者根據(jù)前幾題的經(jīng)驗(yàn),非數(shù)字就是'--'
df %
??filter(`換手率(%)` != '--')66數(shù)據(jù)可視化題目:繪制換手率的密度曲線難度:???期望結(jié)果R解法df$`換手率(%)` as.double(df$`換手率(%)`)
ggplot(df) +
??geom_density(aes(`換手率(%)`))67數(shù)據(jù)計(jì)算題目:計(jì)算前一天與后一天收盤價(jià)的差值難度:??R解法df?%>%
??summarise(delta = `收盤價(jià)(元)`?- lag(`收盤價(jià)(元)`))68數(shù)據(jù)計(jì)算題目:計(jì)算前一天與后一天收盤價(jià)變化率難度:??R解法df?%>%
??summarise(pct_change = (`收盤價(jià)(元)`?- lag(`收盤價(jià)(元)`))/lag(`收盤價(jià)(元)`))69數(shù)據(jù)處理題目:設(shè)置日期為索引難度:?R解法df %>%
??column_to_rownames(var='日期')70指標(biāo)計(jì)算
題目:以5個(gè)數(shù)據(jù)作為一個(gè)數(shù)據(jù)滑動(dòng)窗口,在這個(gè)5個(gè)數(shù)據(jù)上取均值(收盤價(jià))
難度:???R解法library(RcppRoll)df?%>%
??transmute(avg_5 = roll_mean(`收盤價(jià)(元)`,n = 5,align="right",fill = NA))71指標(biāo)計(jì)算
題目:以5個(gè)數(shù)據(jù)作為一個(gè)數(shù)據(jù)滑動(dòng)窗口,計(jì)算這五個(gè)數(shù)據(jù)總和(收盤價(jià))
難度:???R解法df?%>%??transmute(sum_5 = roll_sum(`收盤價(jià)(元)`,n = 5,align="right",fill = NA))72數(shù)據(jù)可視化
題目:將收盤價(jià)5日均線、20日均線與原始數(shù)據(jù)繪制在同一個(gè)圖上
難度:???期望結(jié)果R解法df?%>%??mutate(avg_5 = roll_mean(`收盤價(jià)(元)`,n = 5,align="right",fill = NA),
?????????avg_20 = roll_mean(`收盤價(jià)(元)`,n = 20,align="right",fill = NA)) %>%
??ggplot() +
??geom_line(aes(日期,`收盤價(jià)(元)`),color = 'steelblue',size = 1.2) +
??geom_line(aes(日期,avg_5),color = 'orange',size = 1.2) +
??geom_line(aes(日期,avg_20),color = 'green',size = 1.2)73數(shù)據(jù)重采樣
題目:按周為采樣規(guī)則,取一周收盤價(jià)最大值
? 難度:???R解法library(plyr)res?"1 week")),"[")
res_max res,function(n)max(n$`收盤價(jià)(元)`),simplify=TRUE)
as.data.frame(res_max)74數(shù)據(jù)可視化
題目:繪制重采樣數(shù)據(jù)與原始數(shù)據(jù)
??難度:???? 期望結(jié)果R解法res?%>%??rownames_to_column('date')
res$date as.Date(res$date)
ggplot(df) +
??geom_line(aes(日期,`收盤價(jià)(元)`),color = 'steelblue',size = 1.2) +
??geom_line(data = res, aes(date,res_max),
????????????color = 'orange',size = 1.2)75數(shù)據(jù)處理題目:將數(shù)據(jù)往后移動(dòng)5天難度:??R解法lag(df,5)76數(shù)據(jù)處理題目:將數(shù)據(jù)向前移動(dòng)5天難度:??R解法lead(df,5)77數(shù)據(jù)計(jì)算題目:使用expending函數(shù)計(jì)算開(kāi)盤價(jià)的移動(dòng)窗口均值難度:??R解法#R中沒(méi)有expanding完全一致的函數(shù)
#考慮到expanding實(shí)際功能就是累積均值
#可以用cummean
#但cummean的功能和我預(yù)想的不同
#可能是包之間相互干擾
#最后采用cumsum/1:n的形式完成本題
res %
??transmute(cummean = cumsum(`開(kāi)盤價(jià)(元)`)/1:dim(df)[1])78數(shù)據(jù)可視化題目:繪制上一題的移動(dòng)均值與原始數(shù)據(jù)折線圖難度:???期望結(jié)果
R解法library(tidyr)
df %>%
??cbind(res) %>%
??dplyr::rename(Opening_Price = `開(kāi)盤價(jià)(元)`,
?????????Expanding_Open_Mean = cummean) %>%
??select(日期,Opening_Price,Expanding_Open_Mean) %>%
??pivot_longer(c(Opening_Price,Expanding_Open_Mean),
???????????????names_to = 'type',
???????????????values_to ='price') %>%
??ggplot(aes(日期,price,color = type)) +
??geom_line(size=1.2) +
??scale_color_manual(values=c('orange','steelblue')) +
??theme_bw() +
??theme(
????panel.grid.major = element_blank(),
????panel.grid.minor = element_blank(),
????legend.title = element_blank(),
????legend.position = c(0.9, 0.9)
??)79數(shù)據(jù)計(jì)算題目:計(jì)算布林指標(biāo)難度:????R解法df %
??mutate(avg_20 = roll_mean(`收盤價(jià)(元)`,n = 20,align="right",fill = NA),
?????????upper_bound = avg_20 + 2?* roll_sd(`收盤價(jià)(元)`,n = 20,align="right",fill = NA),
?????????lower_bound = avg_20 - 2?* roll_sd(`收盤價(jià)(元)`,n = 20,align="right",fill = NA))80數(shù)據(jù)可視化題目:計(jì)算布林線并繪制難度:???期望結(jié)果R解法df %>%
??dplyr::rename(former_30_days_rolling_Close_mean = avg_20,
????????????????Closing_Price = `收盤價(jià)(元)`) %>%
??select(日期,Closing_Price,
???????????former_30_days_rolling_Close_mean,upper_bound,lower_bound) %>%
??pivot_longer(c(Closing_Price,former_30_days_rolling_Close_mean,upper_bound,lower_bound),
???????????????names_to = 'type',
???????????????values_to ='price') %>%
??ggplot(aes(日期,price,color = type)) +
??geom_line(size=1.2) +
??scale_color_manual(values=c('steelblue','orange','red','green')) +
??theme_bw() +
??theme(
????panel.grid.major = element_blank(),
????panel.grid.minor = element_blank(),
????legend.title = element_blank(),
????legend.position = c(0.6, 0.2)
??)81數(shù)據(jù)查看題目:導(dǎo)入并查看pandas與numpy版本難度:?R語(yǔ)言解法packageVersion("tidyverse")
# [1] ‘1.3.0’
packageVersion("dplyr")
# [1] ‘0.8.99.9002’82數(shù)據(jù)創(chuàng)建題目:從NumPy數(shù)組創(chuàng)建DataFrame難度:?備注使用numpy生成20個(gè)0-100隨機(jī)數(shù)R語(yǔ)言解法df1 20,function(n)?{
??replicate(n,sample(1:100,1))
}) %>%
??as.data.frame(.) %>%
??dplyr::rename(`0` = V1)83數(shù)據(jù)創(chuàng)建題目:從NumPy數(shù)組創(chuàng)建DataFrame難度:?備注使用numpy生成20個(gè)0-100固定步長(zhǎng)的數(shù)R語(yǔ)言解法df2 as.data.frame(seq(0,99,5)) %>%
??dplyr::rename(`0` = "seq(0, 99, 5)")84數(shù)據(jù)創(chuàng)建題目:從NumPy數(shù)組創(chuàng)建DataFrame難度:?備注使用numpy生成20個(gè)指定分布(如標(biāo)準(zhǔn)正態(tài)分布)的數(shù)R語(yǔ)言解法df3 as.data.frame(rnorm(20,0,1)) %>%
??dplyr::rename(`0` = "rnorm(20, 0, 1)")85數(shù)據(jù)創(chuàng)建題目:將df1,df2,df3按照行合并為新DataFrame難度:??R語(yǔ)言解法df?86數(shù)據(jù)創(chuàng)建題目:將df1,df2,df3按照列合并為新DataFrame難度:??期望結(jié)果0 1 2
0 95 0 0.022492
1 22 5?-1.209494
2 3 10 0.876127
3 21 15?-0.162149
4 51 20?-0.815424
5 30 25?-0.303792
...............R語(yǔ)言解法df?names(df) 0,1,2)87數(shù)據(jù)查看題目:查看df所有數(shù)據(jù)的最小值、25%分位數(shù)、中位數(shù)、75%分位數(shù)、最大值難度:??R語(yǔ)言解法summary(unlist(df))88數(shù)據(jù)修改題目:修改列名為col1,col2,col3難度:?R語(yǔ)言解法df %
??dplyr::rename(col1 = 1,
????????????????col2 = 2,
????????????????col3 = 3)
# 或者用類似pandas的方法
names(df) 'col1','col2','col3')89數(shù)據(jù)提取題目:提取第一列中不在第二列出現(xiàn)的數(shù)字難度:???R語(yǔ)言解法df[!(df$col1?%in% df$col2),1]90數(shù)據(jù)提取題目:提取第一列和第二列出現(xiàn)頻率最高的三個(gè)數(shù)字難度:???R語(yǔ)言解法count(unlist(c(df$col1,df$col2))) %>%
??arrange(desc(freq)) %>%
??filter(row_number() <= 3)91數(shù)據(jù)提取題目:提取第一列中可以整除5的數(shù)字位置難度:???R語(yǔ)言解法which(df['col1'] %% 5==0)92數(shù)據(jù)計(jì)算題目:計(jì)算第一列數(shù)字前一個(gè)與后一個(gè)的差值難度:??R語(yǔ)言解法df?%>%
??summarise(col1 - lag(col1)) %>%
??na.omit(.) # 不去NA也可以,pandas沒(méi)有去除93數(shù)據(jù)處理題目:將col1,col2,clo3三列順序顛倒難度:??R語(yǔ)言解法df?%>%
??select(col3,col2,everything())94數(shù)據(jù)提取題目:提取第一列位置在1,10,15的數(shù)字難度:??R語(yǔ)言解法df[c(1,10,15) + 1,1]95數(shù)據(jù)查找題目:查找第一列的局部最大值位置難度:????備注即比它前一個(gè)與后一個(gè)數(shù)字的都大的數(shù)字R語(yǔ)言解法res1 0))
res2 0))
intersect(res1,res2)
# [1] 3 5 7 12 14 17 19
# 另一種方法,類似pandas的用符號(hào)判斷
res
which(res - lag(res) == -2) - 1
# # [1] 3 5 7 12 14 17 1996數(shù)據(jù)計(jì)算題目:按行計(jì)算df的每一行均值難度:??R語(yǔ)言解法rowMeans(df)97數(shù)據(jù)計(jì)算題目:對(duì)第二列計(jì)算移動(dòng)平均值難度:???備注每次移動(dòng)三個(gè)位置,不可以使用自定義函數(shù)R語(yǔ)言解法library(RcppRoll)
df?%>%
??summarise(avg_3 = roll_mean(col2, n=3))98數(shù)據(jù)修改題目:將數(shù)據(jù)按照第三列值的大小升序排列難度:??R語(yǔ)言解法df %
??arrange(col3)99數(shù)據(jù)修改題目:將第一列大于50的數(shù)字修改為'高'難度:??R語(yǔ)言解法df[df$col1 > 50,1] '高'100數(shù)據(jù)計(jì)算題目:計(jì)算第一列與第二列之間的歐式距離難度:???備注不可以使用自定義函數(shù)R語(yǔ)言解法# 可以利用概念計(jì)算
res 2
sqrt(sum(res))
# [1] 197.0102
# 也可以利用dist函數(shù),但需要形成兩個(gè)不同的觀測(cè)
dist(rbind(df$col1,df$col2))
# 1
# 2 197.0102101數(shù)據(jù)讀取題目:從CSV文件中讀取指定數(shù)據(jù)難度:??備注從數(shù)據(jù)1中的前10行中讀取positionName, salary兩列R語(yǔ)言解法#一步讀取文件的指定列用readr包或者原生函數(shù)都沒(méi)辦法
#如果文件特別大又不想全部再選指定列可以用如下辦法
#基本思想先讀取較少的數(shù)據(jù)獲取列名
#給目標(biāo)列以外的列打上NULL導(dǎo)致第二次讀取文件時(shí)NULL列丟失即可
res '數(shù)據(jù)1.csv',encoding = 'GBK',nrows = 3)
classes class)
classes[-match(c('positionName','salary'),names(classes))] ??rep('NULL', length(classes)?- 2)
df?read.csv('數(shù)據(jù)1.csv',encoding = 'GBK',nrows = 10,
???????????????colClasses = classes)102數(shù)據(jù)讀取題目:從CSV文件中讀取指定數(shù)據(jù)難度:??備注從數(shù)據(jù)2中讀取數(shù)據(jù)并在讀取數(shù)據(jù)時(shí)將薪資大于10000的為改為高R語(yǔ)言解法library(readr)
df2 '數(shù)據(jù)2.csv') %>%
??mutate('學(xué)歷要求',
?????????'薪資水平'?= ifelse(
???????????薪資水平 > 10000,'高','低'))103數(shù)據(jù)計(jì)算題目:從dataframe提取數(shù)據(jù)難度:???備注從上一題數(shù)據(jù)中,對(duì)薪資水平列每隔20行進(jìn)行一次抽樣
期望結(jié)果
R語(yǔ)言解法df2[seq(1,dim(df2)[1],20),]104數(shù)據(jù)處理題目:將數(shù)據(jù)取消使用科學(xué)計(jì)數(shù)法難度:??輸入df?= pd.DataFrame(np.random.random(10)**10, columns=['data'])
期望結(jié)果
R語(yǔ)言解法
df 10)^10)round(df,3)105數(shù)據(jù)處理題目:將上一題的數(shù)據(jù)轉(zhuǎn)換為百分?jǐn)?shù)難度:???期望結(jié)果R語(yǔ)言解法tibble(data = str_glue('{round(df$data * 100,2)}%'))106數(shù)據(jù)查找題目:查找上一題數(shù)據(jù)中第3大值的行號(hào)難度:???R語(yǔ)言解法df?%>%
??mutate(nrow = rownames(.)) %>%
??arrange(desc(data)) %>%
??filter(row_number() == 3) %>%
??select(nrow)107數(shù)據(jù)處理題目:反轉(zhuǎn)df的行難度:??R語(yǔ)言解法df?%>%
??arrange(desc(rownames(.)))108數(shù)據(jù)重塑題目:按照多列對(duì)數(shù)據(jù)進(jìn)行合并難度:??輸入df1= pd.DataFrame({'key1': ['K0',?'K0',?'K1',?'K2'],
'key2': ['K0',?'K1',?'K0',?'K1'],
'A': ['A0',?'A1',?'A2',?'A3'],
'B': ['B0',?'B1',?'B2',?'B3']})
df2= pd.DataFrame({'key1': ['K0',?'K1',?'K1',?'K2'],
'key2': ['K0',?'K0',?'K0',?'K0'],
'C': ['C0',?'C1',?'C2',?'C3'],
'D': ['D0',?'D1',?'D2',?'D3']})R語(yǔ)言解法df1 ??"key1"?= c("K0","K0","K1","K2"),
??"key2"?= c("K0","K1","K0","K1"),
??"A"?= paste0('A',0:3),
??"B"?= paste0('B',0:3)
)
df2 ??"key1"?= c("K0","K1","K1","K2"),
??"key2"?= paste0('K',rep(0,4)),
??"C"?= paste0('C',0:3),
??"D"?= paste0('D',0:3)
)
full_join(df1,df2,by?= c('key1','key2')) %>%
??na.omit(.)109數(shù)據(jù)重塑題目:按照多列對(duì)數(shù)據(jù)進(jìn)行合并難度:??
備注
只保存df1的數(shù)據(jù)
R語(yǔ)言解法left_join(df1,df2,by?= c('key1','key2'))110數(shù)據(jù)處理題目:再次讀取數(shù)據(jù)1并顯示所有的列難度:??備注數(shù)據(jù)中由于列數(shù)較多中間列不顯示R語(yǔ)言解法df '數(shù)據(jù)1.csv', locale = locale(encoding = "GBK")) %>%??print(width = Inf)111數(shù)據(jù)查找題目:查找secondType與thirdType值相等的行號(hào)難度:??R語(yǔ)言解法df?%>%
??mutate(nrow = rownames(.)) %>%
??filter(secondType == thirdType) %>%
??select(nrow) %>%
??unlist()112數(shù)據(jù)查找題目:查找薪資大于平均薪資的第三個(gè)數(shù)據(jù)難度:???R語(yǔ)言解法df %>%
??mutate(nrow = rownames(.)) %>%
??filter(salary > mean(salary)) %>%
??select(nrow) %>%
??filter(row_number() == 3)
# # A tibble: 1 x 1
# nrow
#
# 1 6113數(shù)據(jù)計(jì)算題目:將上一題數(shù)據(jù)的salary列開(kāi)根號(hào)難度:??R語(yǔ)言解法df %>%
??summarise(salary_sqrt = sqrt(salary))114數(shù)據(jù)處理題目:將上一題數(shù)據(jù)的linestaion列按_拆分難度:??R語(yǔ)言解法df %
??mutate(split?= str_split(linestaion,'_'))115數(shù)據(jù)查看題目:查看上一題數(shù)據(jù)中一共有多少列難度:?R語(yǔ)言解法length(df)
# [1] 54116數(shù)據(jù)提取題目:提取industryField列以'數(shù)據(jù)'開(kāi)頭的行難度:??R語(yǔ)言解法df[grep("^數(shù)據(jù)", df$industryField),]117數(shù)據(jù)計(jì)算題目:以salary score 和 positionID制作數(shù)據(jù)透視難度:???R語(yǔ)言解法df %
??group_by(positionId) %>%
??dplyr::summarise(salary = mean(salary),
????????????score = mean(score)) %>%
??as.data.frame(.)
rownames(df) NULL
tibble::column_to_rownames(df,var='positionId')118數(shù)據(jù)計(jì)算題目:同時(shí)對(duì)salary、score兩列進(jìn)行計(jì)算難度:???R語(yǔ)言解法res?%
??select(salary,score) %>%
??pivot_longer(c(salary,score),names_to = 'type',values_to = 'value') %>%
??group_by(type) %>%
??summarise(sum = sum(value),mean = mean(value),min?= min(value))
rownames(res)
res?%>%
??column_to_rownames('type') %>%
??t(.)119數(shù)據(jù)計(jì)算題目:對(duì)不同列執(zhí)行不同的計(jì)算難度:???備注對(duì)salary求平均,對(duì)score列求和R語(yǔ)言解法df?%>%
??summarise(salary_sum = sum(salary),
????????????score_mean = mean(score))120數(shù)據(jù)計(jì)算題目:計(jì)算并提取平均薪資最高的區(qū)難度:????R語(yǔ)言解法df?%>%
??group_by(district) %>%
??summarise(avg = mean(salary)) %>%
??arrange(desc(avg)) %>%
??filter(row_number() == 1)
以上就是玩轉(zhuǎn)數(shù)據(jù)處理120題|R語(yǔ)言版全部?jī)?nèi)容,如果能堅(jiān)持走到這里的讀者,我想你已經(jīng)掌握了處理數(shù)據(jù)的常用操作,并且在之后的數(shù)據(jù)分析中碰到相關(guān)問(wèn)題,希望你能夠從容的解決!
R語(yǔ)言解法作者介紹:陳熹,中山大學(xué)中山眼科中心博士在讀,不安分的python R SQL愛(ài)好者
興趣范圍:生物信息 / 數(shù)據(jù)分析 / 網(wǎng)絡(luò)爬蟲 / 機(jī)器學(xué)習(xí)
簡(jiǎn)書:半為花間酒
Email:chenx6542@foxmail.com
《新程序員》:云原生和全面數(shù)字化實(shí)踐50位技術(shù)專家共同創(chuàng)作,文字、視頻、音頻交互閱讀總結(jié)
以上是生活随笔為你收集整理的go语言csv包_玩转数据处理120题R语言版本的全部?jī)?nèi)容,希望文章能夠幫你解決所遇到的問(wèn)題。
- 上一篇: python 生成001开始的序号_你知
- 下一篇: import pymysql 没有模块_