Rで読書Hackと対応分析 (Reading Hacks with R, and Correspondence Analysis)
> me
$name
[1] "Takashi Kitano"
$twitter
[1] "@kashitan"
$work_in
[1] " " -> " ( 6/30)"
The new era began two months ago.
How is everyone doing?
Quite a few of you have probably made a fresh start.
So here's an idea:
let's do some book hacking with R.
titles <- c(" ", " ",
" ", " ",
"7 ")
books <- dplyr::tibble(
docId = as.character(1:length(titles)),
title = titles,
author = c(rep(" ", 2),
rep(" F ", 2),
" R "))
books %>% knitr::kable()
docId title author
1
2
3 F
4 F
5 7 R
sentDf %>% head() %>% knitr::kable()
docId sectionId sentenceId sentence
1 1_0006 1_0006_0001 ──
1 1_0006 1_0006_0002
1 1_0006 1_0006_0003
1 1_0006 1_0006_0004
1 1_0006 1_0006_0005
1 1_0006 1_0006_0006
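The slides show sentDf (one row per sentence, keyed by docId / sectionId / sentenceId) but not how it was built. A minimal sketch of one possible construction, assuming one plain-text UTF-8 file per book, sections separated by blank lines, and sentences ending in 「。」; the file names and the splitting rules here are purely hypothetical stand-ins (the real sectionIds in the deck clearly come from a richer source structure).

library(dplyr)
library(purrr)
library(stringr)
library(tidyr)

# Hypothetical input: one text file per book, in the same order as `titles`.
files <- sprintf("book_%d.txt", seq_along(titles))

sentDf <- tibble(docId = as.character(seq_along(files)), file = files) %>%
  # split each book into blank-line-delimited sections
  mutate(section = map(file, ~ str_split(paste(readLines(.x, encoding = "UTF-8"),
                                               collapse = "\n"),
                                         "\\n\\s*\\n")[[1]])) %>%
  select(-file) %>%
  unnest(section) %>%
  group_by(docId) %>%
  mutate(sectionId = sprintf("%s_%04d", docId, row_number())) %>%
  ungroup() %>%
  # split sections into sentences, keeping the sentence-final 「。」
  mutate(sentence = str_split(section, "(?<=。)")) %>%
  select(-section) %>%
  unnest(sentence) %>%
  filter(str_trim(sentence) != "") %>%
  group_by(sectionId) %>%
  mutate(sentenceId = sprintf("%s_%04d", sectionId, row_number())) %>%
  ungroup() %>%
  select(docId, sectionId, sentenceId, sentence)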
tokenDf <- sentDf %>%
  as.data.frame() %>%
  RMeCab::RMeCabDF("sentence", 1) %>%
  purrr::set_names(nm = sentDf$sentenceId) %>%
  purrr::map2_dfr(.x = ., .y = names(.), .f = function(x, y) {
    # x: named character vector (tokens as values, part of speech as names)
    # y: the sentenceId, e.g. "1_0006_0001"
    tibble::tibble(docId = stringr::str_replace(y, "_.*", ""),
                   sectionId = stringr::str_replace(y, "(.+_.*)_.*", "\\1"),
                   sentenceId = y,
                   token = x,
                   hinshi = names(x))
  })
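RMeCab::RMeCabDF() returns one named character vector per input row: the values are the tokens and the names are their parts of speech, which is why the code above sets token = x and hinshi = names(x). A quick check on a throwaway sentence (assuming MeCab and a UTF-8 dictionary are installed locally):

# One element per row; tokens as values, part of speech (hinshi) as names.
# The third argument (1) asks for base forms, as in the slide code.
one <- RMeCab::RMeCabDF(data.frame(sentence = "今日は良い天気です。",
                                   stringsAsFactors = FALSE),
                        "sentence", 1)[[1]]
one          # tokens
names(one)   # e.g. "名詞" "助詞" ...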
tokenDf %>% head(n = 10) %>% knitr::kable()
docId sectionId sentenceId token hinshi
1 1_0006 1_0006_0001
1 1_0006 1_0006_0001 ─
1 1_0006 1_0006_0001 ─
1 1_0006 1_0006_0001
1 1_0006 1_0006_0001
1 1_0006 1_0006_0001
1 1_0006 1_0006_0002
1 1_0006 1_0006_0002
1 1_0006 1_0006_0002
1 1_0006 1_0006_0002
https://www.intage.co.jp/glossary/400/
https://www.gastonsanchez.com/visually-enforced/how-to/2012/07/19/Correspondence-Analysis/
https://youtu.be/dE10fBCDWQc
sw <- c(" ", " ", " ", " ", " ", " ", " ", " ",
" ", " ", " ")
CA_in <- tokenDf %>%
dplyr::filter(hinshi %in% c(" ", " ")) %>%
dplyr::filter(! token %in% sw) %>%
dplyr::count(docId, token) %>%
dplyr::ungroup() %>%
dplyr::inner_join(books, by = "docId") %>%
dplyr::select(token, title, n) %>%
tidyr::spread(key = title, value=n, fill = 0) %>%
dplyr::mutate(max = pmax(!!!rlang::syms(titles))) %>%
dplyr::top_n(n = 60, wt = max) %>%
as.data.frame()
rownames(CA_in) <- CA_in$token
CA_in <- CA_in %>%
select(one_of(titles))
CA_in %>% head() %>% knitr::kable()
7
0 1 0 0 269
17 15 166 431 110
0 0 0 0 444
148 116 1 15 538
103 71 69 94 246
182 175 87 87 236
# store the result for later plotting; CA() also draws its default factor map
CA_out <- FactoMineR::CA(CA_in)
Prof. Fujimoto, the author of 『対応分析入門』 (An Introduction to Correspondence Analysis),
raised an issue, but it has sat unaddressed for almost two years...
d <- dplyr::tibble(
x = CA_out$row$coord[, 1],
y = CA_out$row$coord[, 2],
label = rownames(CA_out$row$coord),
type = "row"
) %>%
dplyr::bind_rows(tibble(
x = CA_out$col$coord[, 1],
y = CA_out$col$coord[, 2],
label = rownames(CA_out$col$coord),
type = "col"
))
d %>% head(n = 12) %>% knitr::kable()
x y label type
-0.4651474 -1.4411540 row
0.9067268 0.1419623 row
-0.4639592 -1.4488050 row
-0.5503475 -0.6609927 row
-0.0656860 -0.2595280 row
-0.2077013 0.0100741 row
0.0335994 0.0251759 row
-0.5759668 0.0906504 row
-0.1080142 0.2464559 row
-0.0559726 0.0791701 row
labels <- glue::glue("Dim {axis} ({format(var, digits = 3, nsmall = 1)} %)",
                     axis = c(1, 2), var = CA_out$eig[1:2, 2])
d %>%
ggplot2::ggplot(aes(x = x, y = y, label = label,
shape = type, colour = type)) +
ggplot2::geom_vline(xintercept = 0, linetype = "dashed") +
ggplot2::geom_hline(yintercept = 0, linetype = "dashed") +
ggplot2::geom_point() +
ggrepel::geom_text_repel(family = "HiraMaruProN-W4") +
ggplot2::scale_shape_manual(values = c(17, 16)) +
ggplot2::xlab(labels[1]) +
ggplot2::ylab(labels[2]) +
ggplot2::ggtitle("CA - Biplot") +
ggplot2::theme(legend.position = "none")
Let's do correspondence analysis with TF-IDF
CA_in <- tokenDf %>%
dplyr::filter(hinshi %in% c(" ", " ")) %>%
dplyr::filter(! token %in% sw) %>%
dplyr::count(docId, token) %>%
dplyr::ungroup() %>%
tidytext::bind_tf_idf(term = token, document = docId, n = n) %>%
dplyr::inner_join(books, by = "docId") %>%
dplyr::select(token, title, tf_idf) %>%
tidyr::spread(key = title, value=tf_idf, fill = 0.0) %>%
dplyr::mutate(tfidf_max = pmax(!!!rlang::syms(titles))) %>%
dplyr::top_n(n = 60, wt = tfidf_max) %>%
as.data.frame()
rownames(CA_in) <- CA_in$token
CA_in <- CA_in %>%
select(one_of(titles))
CA_in %>% head() %>% knitr::kable()
7
0.00E+00 0.0000224 0 0.00E+00 0.0036436
5.91E-05 0.0019978 0 0.00E+00 0.0000151
5.91E-05 0.0019853 0 0.00E+00 0.0000151
0.00E+00 0.0000000 0 0.00E+00 0.0014751
Ⅱ 0.00E+00 0.0000000 0 2.49E-05 0.0008940
0.00E+00 0.0000000 0 0.00E+00 0.0025933
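For reference, tidytext::bind_tf_idf() weights each token by tf × idf, where tf is the token's count divided by the total token count of that book and idf is the natural log of (number of books / number of books containing the token). A minimal hand computation, assuming `counts` holds the docId / token / n table produced by dplyr::count() in the pipeline above:

# Hand-rolled TF-IDF, for comparison with tidytext::bind_tf_idf()
manual_tfidf <- counts %>%
  dplyr::group_by(docId) %>%
  dplyr::mutate(tf = n / sum(n)) %>%              # term frequency within each book
  dplyr::ungroup() %>%
  dplyr::add_count(token, name = "n_docs") %>%    # number of books containing the token
  dplyr::mutate(idf = log(dplyr::n_distinct(docId) / n_docs),
                tf_idf = tf * idf)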
Let's try analyzing it with a topic model
https://www.albert2005.co.jp/knowledge/machine_learning/topic_model/about_topic_model
https://youtu.be/dE10fBCDWQc
LDA_in <- tokenDf %>%
dplyr::filter(hinshi %in% c(" ", " ")) %>%
dplyr::filter(! token %in% sw) %>%
dplyr::group_by(docId, token) %>%
dplyr::count(token) %>%
dplyr::ungroup() %>%
dplyr::group_by(token) %>%
dplyr::mutate(total = sum(n)) %>%
dplyr::ungroup() %>%
dplyr::top_n(n = 5000, wt = total) %>%
dplyr::inner_join(books, by = "docId") %>%
tidytext::cast_dtm(title, token, n)
LDA_out <- topicmodels::LDA(LDA_in, k = 5,
control = list(seed = 123))
LDA_out %>%
tidytext::tidy(matrix = "gamma") %>%
ggplot2::ggplot(aes(factor(topic), gamma)) +
ggplot2::geom_boxplot() +
ggplot2::facet_wrap(~ document) +
ggplot2::theme_minimal(base_family = "HiraMaruProN-W4")
LDA_out %>%
tidytext::tidy() %>%
dplyr::group_by(topic) %>%
dplyr::top_n(n = 10, wt = beta) %>%
dplyr::ungroup() %>%
dplyr::mutate(term = reorder(term, beta)) %>%
ggplot2::ggplot(aes(term, beta, fill = factor(topic))) +
ggplot2::geom_col(show.legend = FALSE) +
ggplot2::facet_wrap(~ topic, scales = "free_y") +
ggplot2::coord_flip() +
ggplot2::theme_minimal(base_family = "HiraMaruProN-W4")
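The number of topics is fixed at k = 5 above, matching the number of books; a quick sanity check is to compare perplexity across a few candidate values of k using topicmodels::perplexity(). A sketch reusing the same LDA_in matrix; the candidate k values are arbitrary:

# Fit LDA for several k and compare perplexity (lower is better)
ks <- c(2, 3, 5, 7, 10)
perp <- purrr::map_dbl(ks, function(k) {
  fit <- topicmodels::LDA(LDA_in, k = k, control = list(seed = 123))
  topicmodels::perplexity(fit)
})
dplyr::tibble(k = ks, perplexity = perp)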
https://logmi.jp/business/articles/156592
Let's summarize the text
https://qiita.com/icoxfog417/items/d06651db10e27220c819
tokenDf <- tokenDf %>%
dplyr::filter(docId == "1") %>%
dplyr::group_by(sectionId) %>%
tidyr::nest()
tokenDf %>% head()
# A tibble: 6 x 2
sectionId data
<chr> <list>
1 1_0006 <tibble [2,642 × 4]>
2 1_0007 <tibble [1,432 × 4]>
3 1_0009 <tibble [5,928 × 4]>
4 1_0010 <tibble [3,755 × 4]>
5 1_0011 <tibble [5,135 × 4]>
6 1_0013 <tibble [3,440 × 4]>
tokenDf$data[[1]] %>%
head() %>%
knitr::kable()
docId sentenceId token hinshi
1 1_0006_0001
1 1_0006_0001 ─
1 1_0006_0001 ─
1 1_0006_0001
1 1_0006_0001
1 1_0006_0001
tokenDf$data[[2]] %>%
head() %>%
knitr::kable()
docId sentenceId token hinshi
1 1_0007_0001
1 1_0007_0001
1 1_0007_0001
1 1_0007_0001
1 1_0007_0001
1 1_0007_0001
simDf <- tokenDf %>%
  # build the per-section argument vectors for lexRankr::sentenceSimil()
  dplyr::mutate(sentenceId = purrr::map(.$data, ~.$sentenceId),
                token = purrr::map(.$data, ~.$token),
                docId = purrr::map2(sectionId, token,
                                    function(x, y) {
                                      rep(x, length(y))
                                    }
                )) %>%
  # compute pairwise sentence similarities within each section
  dplyr::mutate(simil = purrr::pmap(list(sentenceId, token, docId),
                                    lexRankr::sentenceSimil)) %>%
  select(sectionId, simil)
simDf %>% head()
# A tibble: 6 x 2
sectionId simil
<chr> <list>
1 1_0006 <df[,3] [5,671 × 3]>
2 1_0007 <df[,3] [2,080 × 3]>
3 1_0009 <df[,3] [39,340 × 3]>
4 1_0010 <df[,3] [13,695 × 3]>
5 1_0011 <df[,3] [25,651 × 3]>
6 1_0013 <df[,3] [12,880 × 3]>
simDf$simil[[1]] %>% head() %>% knitr::kable()
sent1 sent2 similVal
1_0006_0001 1_0006_0002 0.5045201
1_0006_0001 1_0006_0003 0.4682931
1_0006_0001 1_0006_0004 0.0000000
1_0006_0001 1_0006_0005 0.5541319
1_0006_0001 1_0006_0006 0.5856045
1_0006_0001 1_0006_0007 0.4752808
topNSents <- simDf %>%
  # build the argument vectors for lexRankr::lexRankFromSimil()
  dplyr::mutate(s1 = purrr::map(.$simil, ~.$sent1),
                s2 = purrr::map(.$simil, ~.$sent2),
                simil = purrr::map(.$simil, ~.$similVal),
                # keep roughly 10% of each section's sentences; the sentence
                # count is recovered from the sent1/sent2 pairs because simDf
                # only carries sectionId and simil at this point
                n = purrr::map(.$simil, function(x) {
                  as.integer(ceiling(dplyr::n_distinct(c(x$sent1, x$sent2)) * 0.1))
                })) %>%
  # run LexRank per section and keep the top-n sentences
  dplyr::mutate(topN = purrr::pmap(list(s1, s2, simil, n),
                                   lexRankr::lexRankFromSimil,
                                   threshold = 0.2,
                                   continuous = TRUE))
topNSents %>% head()
# A tibble: 6 x 2
sectionId topN
<chr> <list>
1 1_0006 <df[,2] [11 × 2]>
2 1_0007 <df[,2] [7 × 2]>
3 1_0009 <df[,2] [29 × 2]>
4 1_0010 <df[,2] [17 × 2]>
5 1_0011 <df[,2] [23 × 2]>
6 1_0013 <df[,2] [17 × 2]>
topNSents$topN[[1]] %>% knitr::kable()
sentenceId value
1_0006_0012 0.0110008
1_0006_0025 0.0110065
1_0006_0027 0.0109621
1_0006_0028 0.0110488
1_0006_0033 0.0109785
1_0006_0040 0.0110083
1_0006_0041 0.0110468
1_0006_0047 0.0110727
1_0006_0059 0.0110430
1_0006_0085 0.0110101
1_0006_0086 0.0109732
res <- topNSents %>%
  select(topN) %>%
  tidyr::unnest() %>%
  dplyr::inner_join(sentDf, by = "sentenceId")
res %>% head() %>% knitr::kable()
sentenceId value docId sectionId sentence
1_0006_0012 0.0110008 1 1_0006
1_0006_0025 0.0110065 1 1_0006
1_0006_0027 0.0109621 1 1_0006
1_0006_0028 0.0110488 1 1_0006
1_0006_0033 0.0109785 1 1_0006
1_0006_0040 0.0110083 1 1_0006 ──
sentDf %>%
filter(docId == "1") %>%
group_by(sectionId) %>%
count()
# A tibble: 31 x 2
# Groups: sectionId [31]
sectionId n
<chr> <int>
1 1_0006 107
2 1_0007 65
3 1_0009 281
4 1_0010 166
5 1_0011 227
6 1_0013 161
res %>%
filter(docId == "1") %>%
group_by(sectionId) %>%
count()
# A tibble: 31 x 2
# Groups: sectionId [31]
sectionId n
<chr> <int>
1 1_0006 11
2 1_0007 7
3 1_0009 29
4 1_0010 17
5 1_0011 23
6 1_0013 17
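To turn the selected sentences back into readable summaries, they can be collapsed per section in document order. A small follow-up sketch; this step is not shown in the slides:

# Stitch each section's top-ranked sentences into a single summary string
summaries <- res %>%
  dplyr::arrange(sectionId, sentenceId) %>%
  dplyr::group_by(sectionId) %>%
  dplyr::summarise(summary = paste(sentence, collapse = "")) %>%
  dplyr::ungroup()
summaries %>% head()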