Rで読書Hackと対応分析 (Reading Hacks with R, and Correspondence Analysis)
> me
$name
[1] "Takashi Kitano"
$twitter
[1] "@kashitan"
$work_in
[1] " " -> " ( 6/30)"
The new era began two months ago.
How is everyone doing?
Quite a few of you have probably made a fresh start.
So here's an idea:
let's do some book hacking with R.
titles <- c(" ", " ",
" ", " ",
"7 ")
books <- dplyr::tibble(
docId = as.character(1:length(titles)),
title = titles,
author = c(rep(" ", 2),
rep(" F ", 2),
" R "))
books %>% knitr::kable()
docId title author
1
2
3 F
4 F
5 7 R
sentDf %>% head() %>% knitr::kable()
docId sectionId sentenceId sentence
1 1_0006 1_0006_0001 ──
1 1_0006 1_0006_0002
1 1_0006 1_0006_0003
1 1_0006 1_0006_0004
1 1_0006 1_0006_0005
1 1_0006 1_0006_0006
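The slides show sentDf (one row per sentence, keyed by docId / sectionId / sentenceId) but not how it was built. A minimal sketch of one possible construction, assuming one plain-text UTF-8 file per book, sections separated by blank lines, and sentences ending in 「。」; the file names and the splitting rules here are purely hypothetical stand-ins (the real sectionIds in the deck clearly come from a richer source structure).

library(dplyr)
library(purrr)
library(stringr)
library(tidyr)

# Hypothetical input: one text file per book, in the same order as `titles`.
files <- sprintf("book_%d.txt", seq_along(titles))

sentDf <- tibble(docId = as.character(seq_along(files)), file = files) %>%
  # split each book into blank-line-delimited sections
  mutate(section = map(file, ~ str_split(paste(readLines(.x, encoding = "UTF-8"),
                                               collapse = "\n"),
                                         "\\n\\s*\\n")[[1]])) %>%
  select(-file) %>%
  unnest(section) %>%
  group_by(docId) %>%
  mutate(sectionId = sprintf("%s_%04d", docId, row_number())) %>%
  ungroup() %>%
  # split sections into sentences, keeping the sentence-final 「。」
  mutate(sentence = str_split(section, "(?<=。)")) %>%
  select(-section) %>%
  unnest(sentence) %>%
  filter(str_trim(sentence) != "") %>%
  group_by(sectionId) %>%
  mutate(sentenceId = sprintf("%s_%04d", sectionId, row_number())) %>%
  ungroup() %>%
  select(docId, sectionId, sentenceId, sentence)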
tokenDf <- sentDf %>%
  as.data.frame() %>%
  RMeCab::RMeCabDF("sentence", 1) %>%
  purrr::set_names(nm = sentDf$sentenceId) %>%
  purrr::map2_dfr(.x = ., .y = names(.), .f = function(x, y) {
    # x: named character vector (tokens as values, part of speech as names)
    # y: the sentenceId, e.g. "1_0006_0001"
    tibble::tibble(docId = stringr::str_replace(y, "_.*", ""),
                   sectionId = stringr::str_replace(y, "(.+_.*)_.*", "\\1"),
                   sentenceId = y,
                   token = x,
                   hinshi = names(x))
  })
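RMeCab::RMeCabDF() returns one named character vector per input row: the values are the tokens and the names are their parts of speech, which is why the code above sets token = x and hinshi = names(x). A quick check on a throwaway sentence (assuming MeCab and a UTF-8 dictionary are installed locally):

# One element per row; tokens as values, part of speech (hinshi) as names.
# The third argument (1) asks for base forms, as in the slide code.
one <- RMeCab::RMeCabDF(data.frame(sentence = "今日は良い天気です。",
                                   stringsAsFactors = FALSE),
                        "sentence", 1)[[1]]
one          # tokens
names(one)   # e.g. "名詞" "助詞" ...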
tokenDf %>% head(n = 10) %>% knitr::kable()
docId sectionId sentenceId token hinshi
1 1_0006 1_0006_0001
1 1_0006 1_0006_0001 ─
1 1_0006 1_0006_0001 ─
1 1_0006 1_0006_0001
1 1_0006 1_0006_0001
1 1_0006 1_0006_0001
1 1_0006 1_0006_0002
1 1_0006 1_0006_0002
1 1_0006 1_0006_0002
1 1_0006 1_0006_0002
https://www.intage.co.jp/glossary/400/
https://www.gastonsanchez.com/visually-enforced/how-to/2012/07/19/Correspondence-Analysis/
https://youtu.be/dE10fBCDWQc
sw <- c(" ", " ", " ", " ", " ", " ", " ", " ",
" ", " ", " ")
CA_in <- tokenDf %>%
dplyr::filter(hinshi %in% c(" ", " ")) %>%
dplyr::filter(! token %in% sw) %>%
dplyr::count(docId, token) %>%
dplyr::ungroup() %>%
dplyr::inner_join(books, by = "docId") %>%
dplyr::select(token, title, n) %>%
tidyr::spread(key = title, value=n, fill = 0) %>%
dplyr::mutate(max = pmax(!!!rlang::syms(titles))) %>%
dplyr::top_n(n = 60, wt = max) %>%
as.data.frame()
rownames(CA_in) <- CA_in$token
CA_in <- CA_in %>%
select(one_of(titles))
CA_in %>% head() %>% knitr::kable()
7
0 1 0 0 269
17 15 166 431 110
0 0 0 0 444
148 116 1 15 538
103 71 69 94 246
182 175 87 87 236
# store the result for later plotting; CA() also draws its default factor map
CA_out <- FactoMineR::CA(CA_in)
Prof. Fujimoto, the author of 『対応分析入門』 (An Introduction to Correspondence Analysis),
raised an issue, but it has sat unaddressed for almost two years...
d <- dplyr::tibble(
x = CA_out$row$coord[, 1],
y = CA_out$row$coord[, 2],
label = rownames(CA_out$row$coord),
type = "row"
) %>%
dplyr::bind_rows(tibble(
x = CA_out$col$coord[, 1],
y = CA_out$col$coord[, 2],
label = rownames(CA_out$col$coord),
type = "col"
))
d %>% head(n = 12) %>% knitr::kable()
x y label type
-0.4651474 -1.4411540 row
0.9067268 0.1419623 row
-0.4639592 -1.4488050 row
-0.5503475 -0.6609927 row
-0.0656860 -0.2595280 row
-0.2077013 0.0100741 row
0.0335994 0.0251759 row
-0.5759668 0.0906504 row
-0.1080142 0.2464559 row
-0.0559726 0.0791701 row
labels <- glue::glue("Dim {axis} ({format(var, digits = 3, nsmall = 1)} %)",
                     axis = c(1, 2), var = CA_out$eig[1:2, 2])
d %>%
ggplot2::ggplot(aes(x = x, y = y, label = label,
shape = type, colour = type)) +
ggplot2::geom_vline(xintercept = 0, linetype = "dashed") +
ggplot2::geom_hline(yintercept = 0, linetype = "dashed") +
ggplot2::geom_point() +
ggrepel::geom_text_repel(family = "HiraMaruProN-W4") +
ggplot2::scale_shape_manual(values = c(17, 16)) +
ggplot2::xlab(labels[1]) +
ggplot2::ylab(labels[2]) +
ggplot2::ggtitle("CA - Biplot") +
ggplot2::theme(legend.position = "none")
Let's do correspondence analysis with TF-IDF
CA_in <- tokenDf %>%
dplyr::filter(hinshi %in% c(" ", " ")) %>%
dplyr::filter(! token %in% sw) %>%
dplyr::count(docId, token) %>%
dplyr::ungroup() %>%
tidytext::bind_tf_idf(term = token, document = docId, n = n) %>%
dplyr::inner_join(books, by = "docId") %>%
dplyr::select(token, title, tf_idf) %>%
tidyr::spread(key = title, value=tf_idf, fill = 0.0) %>%
dplyr::mutate(tfidf_max = pmax(!!!rlang::syms(titles))) %>%
dplyr::top_n(n = 60, wt = tfidf_max) %>%
as.data.frame()
rownames(CA_in) <- CA_in$token
CA_in <- CA_in %>%
select(one_of(titles))
CA_in %>% head() %>% knitr::kable()
7
0.00E+00 0.0000224 0 0.00E+00 0.0036436
5.91E-05 0.0019978 0 0.00E+00 0.0000151
5.91E-05 0.0019853 0 0.00E+00 0.0000151
0.00E+00 0.0000000 0 0.00E+00 0.0014751
Ⅱ 0.00E+00 0.0000000 0 2.49E-05 0.0008940
0.00E+00 0.0000000 0 0.00E+00 0.0025933
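For reference, tidytext::bind_tf_idf() weights each token by tf × idf, where tf is the token's count divided by the total token count of that book and idf is the natural log of (number of books / number of books containing the token). A minimal hand computation, assuming `counts` holds the docId / token / n table produced by dplyr::count() in the pipeline above:

# Hand-rolled TF-IDF, for comparison with tidytext::bind_tf_idf()
manual_tfidf <- counts %>%
  dplyr::group_by(docId) %>%
  dplyr::mutate(tf = n / sum(n)) %>%              # term frequency within each book
  dplyr::ungroup() %>%
  dplyr::add_count(token, name = "n_docs") %>%    # number of books containing the token
  dplyr::mutate(idf = log(dplyr::n_distinct(docId) / n_docs),
                tf_idf = tf * idf)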
Let's try analyzing it with a topic model
https://www.albert2005.co.jp/knowledge/machine_learning/topic_model/about_topic_model
https://youtu.be/dE10fBCDWQc
LDA_in <- tokenDf %>%
dplyr::filter(hinshi %in% c(" ", " ")) %>%
dplyr::filter(! token %in% sw) %>%
dplyr::group_by(docId, token) %>%
dplyr::count(token) %>%
dplyr::ungroup() %>%
dplyr::group_by(token) %>%
dplyr::mutate(total = sum(n)) %>%
dplyr::ungroup() %>%
dplyr::top_n(n = 5000, wt = total) %>%
dplyr::inner_join(books, by = "docId") %>%
tidytext::cast_dtm(title, token, n)
LDA_out <- topicmodels::LDA(LDA_in, k = 5,
control = list(seed = 123))
LDA_out %>%
tidytext::tidy(matrix = "gamma") %>%
ggplot2::ggplot(aes(factor(topic), gamma)) +
ggplot2::geom_boxplot() +
ggplot2::facet_wrap(~ document) +
ggplot2::theme_minimal(base_family = "HiraMaruProN-W4")
LDA_out %>%
tidytext::tidy() %>%
dplyr::group_by(topic) %>%
dplyr::top_n(n = 10, wt = beta) %>%
dplyr::ungroup() %>%
dplyr::mutate(term = reorder(term, beta)) %>%
ggplot2::ggplot(aes(term, beta, fill = factor(topic))) +
ggplot2::geom_col(show.legend = FALSE) +
ggplot2::facet_wrap(~ topic, scales = "free_y") +
ggplot2::coord_flip() +
ggplot2::theme_minimal(base_family = "HiraMaruProN-W4")
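The number of topics is fixed at k = 5 above, matching the number of books; a quick sanity check is to compare perplexity across a few candidate values of k using topicmodels::perplexity(). A sketch reusing the same LDA_in matrix; the candidate k values are arbitrary:

# Fit LDA for several k and compare perplexity (lower is better)
ks <- c(2, 3, 5, 7, 10)
perp <- purrr::map_dbl(ks, function(k) {
  fit <- topicmodels::LDA(LDA_in, k = k, control = list(seed = 123))
  topicmodels::perplexity(fit)
})
dplyr::tibble(k = ks, perplexity = perp)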
https://logmi.jp/business/articles/156592
Let's summarize the text
https://qiita.com/icoxfog417/items/d06651db10e27220c819
tokenDf <- tokenDf %>%
dplyr::filter(docId == "1") %>%
dplyr::group_by(sectionId) %>%
tidyr::nest()
tokenDf %>% head()
# A tibble: 6 x 2
sectionId data
<chr> <list>
1 1_0006 <tibble [2,642 × 4]>
2 1_0007 <tibble [1,432 × 4]>
3 1_0009 <tibble [5,928 × 4]>
4 1_0010 <tibble [3,755 × 4]>
5 1_0011 <tibble [5,135 × 4]>
6 1_0013 <tibble [3,440 × 4]>
tokenDf$data[[1]] %>%
head() %>%
knitr::kable()
docId sentenceId token hinshi
1 1_0006_0001
1 1_0006_0001 ─
1 1_0006_0001 ─
1 1_0006_0001
1 1_0006_0001
1 1_0006_0001
tokenDf$data[[2]] %>%
head() %>%
knitr::kable()
docId sentenceId token hinshi
1 1_0007_0001
1 1_0007_0001
1 1_0007_0001
1 1_0007_0001
1 1_0007_0001
1 1_0007_0001
simDf <- tokenDf %>%
  # build the per-section argument vectors for lexRankr::sentenceSimil()
  dplyr::mutate(sentenceId = purrr::map(.$data, ~.$sentenceId),
                token = purrr::map(.$data, ~.$token),
                docId = purrr::map2(sectionId, token,
                                    function(x, y) {
                                      rep(x, length(y))
                                    }
                )) %>%
  # compute pairwise sentence similarities within each section
  dplyr::mutate(simil = purrr::pmap(list(sentenceId, token, docId),
                                    lexRankr::sentenceSimil)) %>%
  select(sectionId, simil)
simDf %>% head()
# A tibble: 6 x 2
sectionId simil
<chr> <list>
1 1_0006 <df[,3] [5,671 × 3]>
2 1_0007 <df[,3] [2,080 × 3]>
3 1_0009 <df[,3] [39,340 × 3]>
4 1_0010 <df[,3] [13,695 × 3]>
5 1_0011 <df[,3] [25,651 × 3]>
6 1_0013 <df[,3] [12,880 × 3]>
simDf$simil[[1]] %>% head() %>% knitr::kable()
sent1 sent2 similVal
1_0006_0001 1_0006_0002 0.5045201
1_0006_0001 1_0006_0003 0.4682931
1_0006_0001 1_0006_0004 0.0000000
1_0006_0001 1_0006_0005 0.5541319
1_0006_0001 1_0006_0006 0.5856045
1_0006_0001 1_0006_0007 0.4752808
topNSents <- simDf %>%
  # build the argument vectors for lexRankr::lexRankFromSimil()
  dplyr::mutate(s1 = purrr::map(.$simil, ~.$sent1),
                s2 = purrr::map(.$simil, ~.$sent2),
                simil = purrr::map(.$simil, ~.$similVal),
                # keep roughly 10% of each section's sentences; the sentence
                # count is recovered from the sent1/sent2 pairs because simDf
                # only carries sectionId and simil at this point
                n = purrr::map(.$simil, function(x) {
                  as.integer(ceiling(dplyr::n_distinct(c(x$sent1, x$sent2)) * 0.1))
                })) %>%
  # run LexRank per section and keep the top-n sentences
  dplyr::mutate(topN = purrr::pmap(list(s1, s2, simil, n),
                                   lexRankr::lexRankFromSimil,
                                   threshold = 0.2,
                                   continuous = TRUE))
topNSents %>% head()
# A tibble: 6 x 2
sectionId topN
<chr> <list>
1 1_0006 <df[,2] [11 × 2]>
2 1_0007 <df[,2] [7 × 2]>
3 1_0009 <df[,2] [29 × 2]>
4 1_0010 <df[,2] [17 × 2]>
5 1_0011 <df[,2] [23 × 2]>
6 1_0013 <df[,2] [17 × 2]>
topNSents$topN[[1]] %>% knitr::kable()
sentenceId value
1_0006_0012 0.0110008
1_0006_0025 0.0110065
1_0006_0027 0.0109621
1_0006_0028 0.0110488
1_0006_0033 0.0109785
1_0006_0040 0.0110083
1_0006_0041 0.0110468
1_0006_0047 0.0110727
1_0006_0059 0.0110430
1_0006_0085 0.0110101
1_0006_0086 0.0109732
res <- topNSents %>%
  select(topN) %>%
  tidyr::unnest() %>%
  dplyr::inner_join(sentDf, by = "sentenceId")
res %>% head() %>% knitr::kable()
sentenceId value docId sectionId sentence
1_0006_0012 0.0110008 1 1_0006
1_0006_0025 0.0110065 1 1_0006
1_0006_0027 0.0109621 1 1_0006
1_0006_0028 0.0110488 1 1_0006
1_0006_0033 0.0109785 1 1_0006
1_0006_0040 0.0110083 1 1_0006 ──
sentDf %>%
filter(docId == "1") %>%
group_by(sectionId) %>%
count()
# A tibble: 31 x 2
# Groups: sectionId [31]
sectionId n
<chr> <int>
1 1_0006 107
2 1_0007 65
3 1_0009 281
4 1_0010 166
5 1_0011 227
6 1_0013 161
res %>%
filter(docId == "1") %>%
group_by(sectionId) %>%
count()
# A tibble: 31 x 2
# Groups: sectionId [31]
sectionId n
<chr> <int>
1 1_0006 11
2 1_0007 7
3 1_0009 29
4 1_0010 17
5 1_0011 23
6 1_0013 17
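To turn the selected sentences back into readable summaries, they can be collapsed per section in document order. A small follow-up sketch; this step is not shown in the slides:

# Stitch each section's top-ranked sentences into a single summary string
summaries <- res %>%
  dplyr::arrange(sectionId, sentenceId) %>%
  dplyr::group_by(sectionId) %>%
  dplyr::summarise(summary = paste(sentence, collapse = "")) %>%
  dplyr::ungroup()
summaries %>% head()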