> me
$name
[1] "Takashi Kitano"
$twitter
[1] "@kashitan"
$work_in
[1] " " -> " ( 6/30)"
# Book titles; a book's docId is its 1-based position in this vector.
titles <- c(" ", " ",
            " ", " ",
            "7 ")
# Lookup table that maps each document id to its title and author.
books <- dplyr::tibble(
  # seq_along() stays correct for an empty vector, unlike 1:length().
  docId = as.character(seq_along(titles)),
  title = titles,
  author = c(rep(" ", 2),
             rep(" F ", 2),
             " R "))
books %>% knitr::kable()
docId title author
1
2
3 F
4 F
5 7 R
sentDf %>% head() %>% knitr::kable()
docId sectionId sentenceId sentence
1 1_0006 1_0006_0001 ──
1 1_0006 1_0006_0002
1 1_0006 1_0006_0003
1 1_0006 1_0006_0004
1 1_0006 1_0006_0005
1 1_0006 1_0006_0006
# Tokenise every sentence and flatten the result into one row per token,
# carrying the document / section / sentence ids along with each token.
tokenDf <- sentDf %>%
  # NOTE(review): the third argument (1) is assumed to request base forms
  # from MeCab -- confirm against the RMeCab documentation.
  RMeCab::RMeCabDF("sentence", 1) %>%
  # Name each per-sentence result by its sentence id so the ids can be
  # recovered inside the mapper below.
  purrr::set_names(nm = sentDf$sentenceId) %>%
  purrr::map2_dfr(.x = ., .y = names(.), .f = function(x, y) {
    dplyr::tibble(
      # Sentence ids look like "<doc>_<section>_<sentence>".
      docId = stringr::str_replace(y, "_.*", ""),
      # Keep "<doc>_<section>" via the capture group; the back-reference
      # must be "\\1" -- a plain "1" would replace every id with "1".
      sectionId = stringr::str_replace(y, "(.+_.*)_.*", "\\1"),
      sentenceId = y,
      token = x,
      # The names of the token vector are stored as the `hinshi` column.
      hinshi = names(x)
    )
  })
docId sectionId sentenceId token hinshi
1 1_0006 1_0006_0001
1 1_0006 1_0006_0001 ─
1 1_0006 1_0006_0001 ─
1 1_0006 1_0006_0001
1 1_0006 1_0006_0001
1 1_0006 1_0006_0001
1 1_0006 1_0006_0002
1 1_0006 1_0006_0002
1 1_0006 1_0006_0002
1 1_0006 1_0006_0002
tokenDf %>% head(n = 10) %>% knitr::kable()
sw <- c(" ", " ", " ", " ", " ", " ", " ", " ",
" ", " ", " ")
# Build the token x title contingency table used as correspondence-analysis
# input: raw counts of the selected parts of speech, stop words removed,
# restricted to the tokens with the highest per-title count.
CA_in <- tokenDf %>%
  dplyr::filter(hinshi %in% c(" ", " ")) %>%
  dplyr::filter(! token %in% sw) %>%
  dplyr::count(docId, token) %>%
  dplyr::ungroup() %>%
  dplyr::inner_join(books, by = "docId") %>%
  dplyr::select(token, title, n) %>%
  # One column per title; tokens absent from a title get a count of 0.
  tidyr::spread(key = title, value=n, fill = 0) %>%
  # Row-wise maximum across all title columns.
  dplyr::mutate(max = pmax(!!!rlang::syms(titles))) %>%
  dplyr::top_n(n = 60, wt = max) %>%
  # The pipeline must terminate before the row-name assignment below.
  # A plain data.frame is used because tibbles do not keep row names.
  as.data.frame()
rownames(CA_in) <- CA_in$token
# Keep only the title columns; the token now lives in the row names.
CA_in <- CA_in %>%
  dplyr::select(dplyr::one_of(titles))
CA_in %>% head() %>% knitr::kable()
0 1 0 0 269
17 15 166 431 110
0 0 0 0 444
148 116 1 15 538
103 71 69 94 246
182 175 87 87 236
# Stack the row and column coordinates of the correspondence analysis into
# a single long table (first two dimensions only) for plotting.
d <- dplyr::tibble(
  x = CA_out$row$coord[, 1],
  y = CA_out$row$coord[, 2],
  label = rownames(CA_out$row$coord),
  type = "row"
) %>%
  dplyr::bind_rows(dplyr::tibble(
    x = CA_out$col$coord[, 1],
    y = CA_out$col$coord[, 2],
    label = rownames(CA_out$col$coord),
    type = "col"
  ))
d %>% head(n = 12) %>% knitr::kable()
x y label type
-0.4651474 -1.4411540 row
0.9067268 0.1419623 row
-0.4639592 -1.4488050 row
-0.5503475 -0.6609927 row
-0.0656860 -0.2595280 row
-0.2077013 0.0100741 row
0.0335994 0.0251759 row
-0.5759668 0.0906504 row
-0.1080142 0.2464559 row
-0.0559726 0.0791701 row
# Axis labels for the first two dimensions.
# NOTE(review): column 2 of CA_out$eig is assumed to hold the percentage of
# variance explained -- confirm against the object that produced CA_out.
labels <- glue::glue(
  "Dim {axis} ({format(var, digits = 3, nsmall = 1)} %)",
  axis = c(1, 2), var = CA_out$eig[1:2, 2])
# Biplot of row and column points, distinguished by shape and colour.
d %>%
  ggplot2::ggplot(ggplot2::aes(x = x, y = y, label = label,
                               shape = type, colour = type)) +
  # Dashed reference lines through the origin.
  ggplot2::geom_vline(xintercept = 0, linetype = "dashed") +
  ggplot2::geom_hline(yintercept = 0, linetype = "dashed") +
  ggplot2::geom_point() +
  ggrepel::geom_text_repel(family = "HiraMaruProN-W4") +
  ggplot2::scale_shape_manual(values = c(17, 16)) +
  ggplot2::xlab(labels[1]) +
  ggplot2::ylab(labels[2]) +
  ggplot2::ggtitle("CA - Biplot") +
  ggplot2::theme(legend.position = "none")
# Same correspondence-analysis input as the count-based version, but the
# cells hold tf-idf weights and tokens are ranked by their maximum tf-idf.
CA_in <- tokenDf %>%
  dplyr::filter(hinshi %in% c(" ", " ")) %>%
  dplyr::filter(! token %in% sw) %>%
  dplyr::count(docId, token) %>%
  dplyr::ungroup() %>%
  tidytext::bind_tf_idf(term = token, document = docId, n = n) %>%
  dplyr::inner_join(books, by = "docId") %>%
  dplyr::select(token, title, tf_idf) %>%
  # One column per title; tokens absent from a title get a weight of 0.
  tidyr::spread(key = title, value=tf_idf, fill = 0.0) %>%
  # Row-wise maximum across all title columns.
  dplyr::mutate(tfidf_max = pmax(!!!rlang::syms(titles))) %>%
  dplyr::top_n(n = 60, wt = tfidf_max) %>%
  # The pipeline must terminate before the row-name assignment below.
  # A plain data.frame is used because tibbles do not keep row names.
  as.data.frame()
rownames(CA_in) <- CA_in$token
# Keep only the title columns; the token now lives in the row names.
CA_in <- CA_in %>%
  dplyr::select(dplyr::one_of(titles))
CA_in %>% head() %>% knitr::kable()
0.00E+00 0.0000224 0 0.00E+00 0.0036436
5.91E-05 0.0019978 0 0.00E+00 0.0000151
5.91E-05 0.0019853 0 0.00E+00 0.0000151
0.00E+00 0.0000000 0 0.00E+00 0.0014751
Ⅱ 0.00E+00 0.0000000 0 2.49E-05 0.0008940
0.00E+00 0.0000000 0 0.00E+00 0.0025933

# Document-term matrix for LDA: per-document token counts for the selected
# parts of speech (stop words removed), limited to the most frequent tokens
# overall, with one document per book title.
LDA_in <- tokenDf %>%
  dplyr::filter(hinshi %in% c(" ", " "), ! token %in% sw) %>%
  # Occurrences of each token within each document.
  dplyr::count(docId, token) %>%
  # Corpus-wide total per token, repeated on every row of that token.
  dplyr::add_count(token, wt = n, name = "total") %>%
  dplyr::top_n(n = 5000, wt = total) %>%
  dplyr::inner_join(books, by = "docId") %>%
  tidytext::cast_dtm(title, token, n)
# Fit a 5-topic LDA model; the seed is fixed so the fit is reproducible.
LDA_out <- topicmodels::LDA(LDA_in, k = 5,
                            control = list(seed = 123))
# Per-document topic proportions (gamma), one panel per document.
LDA_out %>%
  tidytext::tidy(matrix = "gamma") %>%
  ggplot2::ggplot(ggplot2::aes(factor(topic), gamma)) +
  ggplot2::geom_boxplot() +
  ggplot2::facet_wrap(~ document) +
  ggplot2::theme_minimal(base_family = "HiraMaruProN-W4")
# Ten highest-probability terms (beta) for each topic, one panel per topic.
LDA_out %>%
  # "beta" is requested explicitly so the intent does not rely on the
  # function's default.
  tidytext::tidy(matrix = "beta") %>%
  dplyr::group_by(topic) %>%
  dplyr::top_n(n = 10, wt = beta) %>%
  dplyr::ungroup() %>%
  # Order the term factor by beta so the bars are sorted.
  dplyr::mutate(term = reorder(term, beta)) %>%
  ggplot2::ggplot(ggplot2::aes(term, beta, fill = factor(topic))) +
  ggplot2::geom_col(show.legend = FALSE) +
  ggplot2::facet_wrap(~ topic, scales = "free_y") +
  ggplot2::coord_flip() +
  ggplot2::theme_minimal(base_family = "HiraMaruProN-W4")
# Restrict to document "1" and nest its tokens per section: one row per
# sectionId with a list-column `data` holding that section's token rows.
tokenDf <- tokenDf %>%
  dplyr::filter(docId == "1") %>%
  dplyr::group_by(sectionId) %>%
  tidyr::nest() %>%
  # Recent tidyr keeps the grouping after nest(); drop it so that later
  # whole-column operations on `data` see every row at once.
  dplyr::ungroup()
tokenDf %>% head()
# A tibble: 6 x 2
sectionId data
<chr> <list>
1 1_0006 <tibble [2,642 × 4]>
2 1_0007 <tibble [1,432 × 4]>
3 1_0009 <tibble [5,928 × 4]>
4 1_0010 <tibble [3,755 × 4]>
5 1_0011 <tibble [5,135 × 4]>
6 1_0013 <tibble [3,440 × 4]>
# Preview the token rows nested under the first section.
tokenDf$data[[1]] %>%
  head() %>%
  knitr::kable()
docId sentenceId token hinshi
1 1_0006_0001
1 1_0006_0001 ─
1 1_0006_0001 ─
1 1_0006_0001
1 1_0006_0001
1 1_0006_0001
# Preview the token rows nested under the second section.
tokenDf$data[[2]] %>%
  head() %>%
  knitr::kable()
docId sentenceId token hinshi
1 1_0007_0001
1 1_0007_0001
1 1_0007_0001
1 1_0007_0001
1 1_0007_0001
1 1_0007_0001
# Pairwise sentence similarity within each section.
simDf <- tokenDf %>%
  # The `.$data` look-ups below span all rows, so the frame must not be
  # grouped when mutate() runs.
  dplyr::ungroup() %>%
  # Build the three parallel vectors passed to lexRankr::sentenceSimil().
  # The section id is repeated once per token and passed as the third
  # (document id) argument.
  dplyr::mutate(sentenceId = purrr::map(.$data, ~.$sentenceId),
                token = purrr::map(.$data, ~.$token),
                docId = purrr::map2(sectionId, token,
                                    function(x, y) {
                                      rep(x, length(y))
                                    })) %>%
  # One similarity data frame per section.
  dplyr::mutate(simil = purrr::pmap(list(sentenceId, token, docId),
                                    lexRankr::sentenceSimil)) %>%
  dplyr::select(sectionId, simil)
simDf %>% head()
# A tibble: 6 x 2
sectionId simil
<chr> <list>
1 1_0006 <df[,3] [5,671 × 3]>
2 1_0007 <df[,3] [2,080 × 3]>
3 1_0009 <df[,3] [39,340 × 3]>
4 1_0010 <df[,3] [13,695 × 3]>
5 1_0011 <df[,3] [25,651 × 3]>
6 1_0013 <df[,3] [12,880 × 3]>
simDf$simil[[1]] %>% head() %>% knitr::kable()
sent1 sent2 similVal
1_0006_0001 1_0006_0002 0.5045201
1_0006_0001 1_0006_0003 0.4682931
1_0006_0001 1_0006_0004 0.0000000
1_0006_0001 1_0006_0005 0.5541319
1_0006_0001 1_0006_0006 0.5856045
1_0006_0001 1_0006_0007 0.4752808
# Rank the sentences of each section with LexRank and keep roughly the top
# 10% of them.
topNSents <- simDf %>%
  # Unpack each similarity frame into the vectors passed to
  # lexRankr::lexRankFromSimil().
  dplyr::mutate(s1 = purrr::map(.$simil, ~.$sent1),
                s2 = purrr::map(.$simil, ~.$sent2),
                simil = purrr::map(.$simil, ~.$similVal),
                # simDf carries only sectionId and simil, so the number of
                # sentences is taken from the ids present in the pairs.
                n = purrr::map(.$simil, function(x) {
                  n_sent <- dplyr::n_distinct(c(x$sent1, x$sent2))
                  as.integer(ceiling(n_sent * 0.1))
                })) %>%
  # threshold and continuous are passed by name; the fourth list element
  # is presumably matched to the function's `n` argument -- confirm
  # against the lexRankr signature.
  dplyr::mutate(topN = purrr::pmap(list(s1, s2, simil, n),
                                   lexRankr::lexRankFromSimil,
                                   threshold = 0.2,
                                   continuous = TRUE)) %>%
  # Keep only the section id and its ranked sentences.
  dplyr::select(sectionId, topN)
topNSents %>% head()
# A tibble: 6 x 2
sectionId topN
<chr> <list>
1 1_0006 <df[,2] [11 × 2]>
2 1_0007 <df[,2] [7 × 2]>
3 1_0009 <df[,2] [29 × 2]>
4 1_0010 <df[,2] [17 × 2]>
5 1_0011 <df[,2] [23 × 2]>
6 1_0013 <df[,2] [17 × 2]>
topNSents$topN[[1]] %>% knitr::kable()
sentenceId value
1_0006_0012 0.0110008
1_0006_0025 0.0110065
1_0006_0027 0.0109621
1_0006_0028 0.0110488
1_0006_0033 0.0109785
1_0006_0040 0.0110083
1_0006_0041 0.0110468
1_0006_0047 0.0110727
1_0006_0059 0.0110430
1_0006_0085 0.0110101
1_0006_0086 0.0109732
# Flatten the per-section rankings and attach the sentence text by joining
# back to sentDf on the sentence id.
res <- topNSents %>%
  dplyr::select(topN) %>%
  # Name the list-column explicitly instead of relying on a bare unnest().
  tidyr::unnest(topN) %>%
  dplyr::inner_join(sentDf, by = "sentenceId")
res %>% head() %>% knitr::kable()
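sentenceId value docId sectionId sentence
1_0006_0012 0.0110008 1 1_0006
1_0006_0025 0.0110065 1 1_0006
1_0006_0027 0.0109621 1 1_0006
1_0006_0028 0.0110488 1 1_0006
1_0006_0033 0.0109785 1 1_0006
1_0006_0040 0.0110083 1 1_0006 ──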
# Number of sentences per section in document "1" before summarisation.
sentDf %>%
  dplyr::filter(docId == "1") %>%
  dplyr::group_by(sectionId) %>%
  dplyr::count()
# A tibble: 31 x 2
# Groups: sectionId [31]
sectionId n
<chr> <int>
1 1_0006 107
2 1_0007 65
3 1_0009 281
4 1_0010 166
5 1_0011 227
6 1_0013 161
# Number of sentences per section in document "1" kept by the summary.
res %>%
  dplyr::filter(docId == "1") %>%
  dplyr::group_by(sectionId) %>%
  dplyr::count()
# A tibble: 31 x 2
# Groups: sectionId [31]
sectionId n
<chr> <int>
1 1_0006 11
2 1_0007 7
3 1_0009 29
4 1_0010 17
5 1_0011 23
6 1_0013 17

  • 1.
  • 2. > me $name [1] "Takashi Kitano" $twitter [1] "@kashitan" $work_in [1] " " -> " ( 6/30)"
  • 5.
  • 6.
  • 7.
  • 9.
  • 10. titles <- c(" ", " ", " ", " ", "7 ") books <- dplyr::tibble( docId = as.character(1:length(titles)), title = titles, author = c(rep(" ", 2), rep(" F ", 2), " R "))
  • 11. books %>% knitr::kable() docId title author 1 2 3 F 4 F 5 7 R
  • 12.
  • 13.
  • 14. sentDf %>% head() %>% knitr::kable() docId sectionId sentenceId sentence 1 1_0006 1_0006_0001 ── 1 1_0006 1_0006_0002 1 1_0006 1_0006_0003 1 1_0006 1_0006_0004 1 1_0006 1_0006_0005 1 1_0006 1_0006_0006
  • 15.
  • 16. tokenDf <- sentDf %>% %>% RMeCab::RMeCabDF("sentence", 1) %>% purrr::set_names(nm = sentDf$sentenceId) %>% purrr::map2_dfr(.x = ., .y = names(.), .f = function(x, y) { tibble(docId = stringr::str_replace(y, "_.*", ""), sectionId = stringr::str_replace(y, "(.+_.*)_.*", "1"), sentenceId = y, token = x, hinshi = names(x)) })
  • 17. docId sectionId sentenceId token hinshi 1 1_0006 1_0006_0001 1 1_0006 1_0006_0001 ─ 1 1_0006 1_0006_0001 ─ 1 1_0006 1_0006_0001 1 1_0006 1_0006_0001 1 1_0006 1_0006_0001 1 1_0006 1_0006_0002 1 1_0006 1_0006_0002 1 1_0006 1_0006_0002 1 1_0006 1_0006_0002 tokenDf %>% head(n = 10) %>% knitr::kable()
  • 18.
  • 19.
  • 24. sw <- c(" ", " ", " ", " ", " ", " ", " ", " ", " ", " ", " ") CA_in <- tokenDf %>% dplyr::filter(hinshi %in% c(" ", " ")) %>% dplyr::filter(! token %in% sw) %>% dplyr::count(docId, token) %>% dplyr::ungroup() %>% dplyr::inner_join(books, by = "docId") %>% dplyr::select(token, title, n) %>% tidyr::spread(key = title, value=n, fill = 0) %>% dplyr::mutate(max = pmax(!!!rlang::syms(titles))) %>% dplyr::top_n(n = 60, wt = max) %>%
  • 25. rownames(CA_in) <- CA_in$token CA_in <- CA_in %>% select(one_of(titles)) CA_in %>% head() %>% knitr::kable() 7 0 1 0 0 269 17 15 166 431 110 0 0 0 0 444 148 116 1 15 538 103 71 69 94 246 182 175 87 87 236
  • 28. d <- dplyr::tibble( x = CA_out$row$coord[, 1], y = CA_out$row$coord[, 2], label = rownames(CA_out$row$coord), type = "row" ) %>% dplyr::bind_rows(tibble( x = CA_out$col$coord[, 1], y = CA_out$col$coord[, 2], label = rownames(CA_out$col$coord), type = "col" ))
  • 29. d %>% head(n = 12) %>% knitr::kable() x y label type -0.4651474 -1.4411540 row 0.9067268 0.1419623 row -0.4639592 -1.4488050 row -0.5503475 -0.6609927 row -0.0656860 -0.2595280 row -0.2077013 0.0100741 row 0.0335994 0.0251759 row -0.5759668 0.0906504 row -0.1080142 0.2464559 row -0.0559726 0.0791701 row
  • 30. labels <- glue::glue("Dim {axis} ({format(var, digits = 3, nsmall = 1)} %)", axis = c(1, 2), var = CA_out$eig[1:2, 2]) d %>% ggplot2::ggplot(aes(x = x, y = y, label = label, shape = type, colour = type)) + ggplot2::geom_vline(xintercept = 0, linetype = "dashed") + ggplot2::geom_hline(yintercept = 0, linetype = "dashed") + ggplot2::geom_point() + ggrepel::geom_text_repel(family = "HiraMaruProN-W4") + ggplot2::scale_shape_manual(values = c(17, 16)) + ggplot2::xlab(labels[1]) + ggplot2::ylab(labels[2]) + ggplot2::ggtitle("CA - Biplot") + ggplot2::theme(legend.position = "none")
  • 31.
  • 32.
  • 34. CA_in <- tokenDf %>% dplyr::filter(hinshi %in% c(" ", " ")) %>% dplyr::filter(! token %in% sw) %>% dplyr::count(docId, token) %>% dplyr::ungroup() %>% tidytext::bind_tf_idf(term = token, document = docId, n = n) %>% dplyr::inner_join(books, by = "docId") %>% dplyr::select(token, title, tf_idf) %>% tidyr::spread(key = title, value=tf_idf, fill = 0.0) %>% dplyr::mutate(tfidf_max = pmax(!!!rlang::syms(titles))) %>% dplyr::top_n(n = 60, wt = tfidf_max) %>%
  • 35. rownames(CA_in) <- CA_in$token CA_in <- CA_in %>% select(one_of(titles)) CA_in %>% head() %>% knitr::kable() 7 0.00E+00 0.0000224 0 0.00E+00 0.0036436 5.91E-05 0.0019978 0 0.00E+00 0.0000151 5.91E-05 0.0019853 0 0.00E+00 0.0000151 0.00E+00 0.0000000 0 0.00E+00 0.0014751 Ⅱ 0.00E+00 0.0000000 0 2.49E-05 0.0008940 0.00E+00 0.0000000 0 0.00E+00 0.0025933
  • 36.
  • 37.
  • 39.
  • 41.
  • 42.
  • 44. LDA_in <- tokenDf %>% dplyr::filter(hinshi %in% c(" ", " ")) %>% dplyr::filter(! token %in% sw) %>% dplyr::group_by(docId, token) %>% dplyr::count(token) %>% dplyr::ungroup() %>% dplyr::group_by(token) %>% dplyr::mutate(total = sum(n)) %>% dplyr::ungroup() %>% dplyr::top_n(n = 5000, wt = total) %>% dplyr::inner_join(books, by = "docId") %>% tidytext::cast_dtm(title, token, n)
  • 45. LDA_out <- topicmodels::LDA(LDA_in, k = 5, control = list(seed = 123)) LDA_out %>% tidytext::tidy(matrix = "gamma") %>% ggplot2::ggplot(aes(factor(topic), gamma)) + ggplot2::geom_boxplot() + ggplot2::facet_wrap(~ document) + ggplot2::theme_minimal(base_family = "HiraMaruProN-W4")
  • 46.
  • 47. LDA_out %>% tidytext::tidy() %>% dplyr::group_by(topic) %>% dplyr::top_n(n = 10, wt = beta) %>% dplyr::ungroup() %>% dplyr::mutate(term = reorder(term, beta)) %>% ggplot2::ggplot(aes(term, beta, fill = factor(topic))) + ggplot2::geom_col(show.legend = FALSE) + ggplot2::facet_wrap(~ topic, scales = "free_y") + ggplot2::coord_flip() + ggplot2::theme_minimal(base_family = "HiraMaruProN-W4")
  • 48.
  • 49.
  • 50.
  • 51.
  • 52.
  • 55.
  • 58.
  • 59.
  • 60. tokenDf <- tokenDf %>% dplyr::filter(docId == "1") %>% dplyr::group_by(sectionId) %>% tidyr::nest() tokenDf %>% head() # A tibble: 6 x 2 sectionId data <chr> <list> 1 1_0006 <tibble [2,642 × 4]> 2 1_0007 <tibble [1,432 × 4]> 3 1_0009 <tibble [5,928 × 4]> 4 1_0010 <tibble [3,755 × 4]> 5 1_0011 <tibble [5,135 × 4]> 6 1_0013 <tibble [3,440 × 4]>
  • 61. tokenDf$data[[1]] %>% head() %>% knitr::kable() docId sentenceId token hinshi 1 1_0006_0001 1 1_0006_0001 ─ 1 1_0006_0001 ─ 1 1_0006_0001 1 1_0006_0001 1 1_0006_0001 tokenDf$data[[2]] %>% head() %>% knitr::kable() docId sentenceId token hinshi 1 1_0007_0001 1 1_0007_0001 1 1_0007_0001 1 1_0007_0001 1 1_0007_0001 1 1_0007_0001
  • 62. simDf <- tokenDf %>% # lexRankr::sentenceSimil() dplyr::mutate(sentenceId = purrr::map(.$data, ~.$sentenceId), token = purrr::map(.$data, ~.$token), docId = purrr::map2(sectionId, token, function(x, y) { rep(x, length(y)) } )) %>% # dplyr::mutate(simil = purrr::pmap(list(sentenceId, token, docId), lexRankr::sentenceSimil)) %>% select(sectionId, simil)
  • 63. simDf %>% head() # A tibble: 6 x 2 sectionId simil <chr> <list> 1 1_0006 <df[,3] [5,671 × 3]> 2 1_0007 <df[,3] [2,080 × 3]> 3 1_0009 <df[,3] [39,340 × 3]> 4 1_0010 <df[,3] [13,695 × 3]> 5 1_0011 <df[,3] [25,651 × 3]> 6 1_0013 <df[,3] [12,880 × 3]>
  • 64. simDf$simil[[1]] %>% head() %>% knitr::kable() sent1 sent2 similVal 1_0006_0001 1_0006_0002 0.5045201 1_0006_0001 1_0006_0003 0.4682931 1_0006_0001 1_0006_0004 0.0000000 1_0006_0001 1_0006_0005 0.5541319 1_0006_0001 1_0006_0006 0.5856045 1_0006_0001 1_0006_0007 0.4752808
  • 65.
  • 66. topNSents <- simDf %>% # lexRankr::lexRankFromSimil dplyr::mutate(s1 = purrr::map(.$simil, ~.$sent1), s2 = purrr::map(.$simil, ~.$sent2), simil = purrr::map(.$simil, ~.$similVal), n = purrr::map(.$sentenceId, function(x) { as.integer(ceiling(dplyr::n_distinct(x) * 0.1)) })) %>% # dplyr::mutate(topN = purrr::pmap(list(s1, s2, simil, n), lexRankr::lexRankFromSimil, threshold = 0.2, continuous = TRUE))
  • 67. topNSents %>% head() # A tibble: 6 x 2 sectionId topN <chr> <list> 1 1_0006 <df[,2] [11 × 2]> 2 1_0007 <df[,2] [7 × 2]> 3 1_0009 <df[,2] [29 × 2]> 4 1_0010 <df[,2] [17 × 2]> 5 1_0011 <df[,2] [23 × 2]> 6 1_0013 <df[,2] [17 × 2]>
  • 68. topNSents$topN[[1]] %>% knitr::kable() sentenceId value 1_0006_0012 0.0110008 1_0006_0025 0.0110065 1_0006_0027 0.0109621 1_0006_0028 0.0110488 1_0006_0033 0.0109785 1_0006_0040 0.0110083 1_0006_0041 0.0110468 1_0006_0047 0.0110727 1_0006_0059 0.0110430 1_0006_0085 0.0110101 1_0006_0086 0.0109732
  • 69. res <- topNSents %>% select(topN) %>% tidyr::unnest() %>% dplyr::inner_join(sentDf, by = “sentenceId") res %>% head() %>% knitr::kable() sentenceI d value docI d section Id sentence 1_0006_0 012 0.0110008 1 1_0006 1_0006_0 025 0.0110065 1 1_0006 1_0006_0 027 0.0109621 1 1_0006 1_0006_0 028 0.0110488 1 1_0006 1_0006_0 033 0.0109785 1 1_0006 1_0006_0 040 0.0110083 1 1_0006 ──
  • 70. sentDf %>% filter(docId == "1") %>% group_by(sectionId) %>% count() # A tibble: 31 x 2 # Groups: sectionId [31] sectionId n <chr> <int> 1 1_0006 107 2 1_0007 65 3 1_0009 281 4 1_0010 166 5 1_0011 227 6 1_0013 161 res %>% filter(docId == "1") %>% group_by(sectionId) %>% count() # A tibble: 31 x 2 # Groups: sectionId [31] sectionId n <chr> <int> 1 1_0006 11 2 1_0007 7 3 1_0009 29 4 1_0010 17 5 1_0011 23 6 1_0013 17