SlideShare a Scribd company logo
1 of 33
Download to read offline
> me
$name
[1] "Takashi Kitano"
$twitter
[1] "@kashitan"
$work_in
[1] " "
前回の神会




> head(talks_en)
# A tibble: 6 x 2
title_en transcript_en
<chr> <chr>
1 Fake videos of real people — and how to … Look at these images. Now…
2 How to build synthetic DNA and send it a… "Alright, let me tell you…
3 Technology that knows what you're feeling "What happens when techno…
4 How to get empowered, not overpowered, b… "After 13.8 billion years…
5 Where joy hides and how to find it "It's 2008, and I'm just …
6 ""You Found Me"" (Cello music starts) You …
> talks_en %>% tidytext::unnest_tokens(word, transcript_en)
> talks_en %>% tidytext::unnest_tokens(word, transcript_en)
# A tibble: 9,840 x 2
title_en word
<chr> <chr>
1 Fake videos of real people — and how to spot them look
2 Fake videos of real people — and how to spot them at
3 Fake videos of real people — and how to spot them these
4 Fake videos of real people — and how to spot them images
5 Fake videos of real people — and how to spot them now
6 Fake videos of real people — and how to spot them tell
7 Fake videos of real people — and how to spot them me
8 Fake videos of real people — and how to spot them which
9 Fake videos of real people — and how to spot them obama
10 Fake videos of real people — and how to spot them here
> talks_ja %>% head()
# A tibble: 6 x 2
title_ja transcript_ja
<chr> <chr>
1 … …
2 DNA … …
3 …
4 AI AI … 138 …
5 2008 …
6 You Found Me … …
> talks_ja %>% tidytext::unnest_tokens(word, transcript_ja)
# A tibble: 6,266,182 x 2
title_ja word
<chr> <chr>
1
2
3
4
5
6
# ... with 6,266,172 more rows
からの罠
> talks_ja %>% tidytext::unnest_tokens(word, transcript_ja)
# A tibble: 6,266,182 x 2
title_ja word
<chr> <chr>
1
2
3
4
5
6
# ... with 6,266,172 more rows
> mecab_result <- talks_ja %>%
+ RMeCabDF("transcript_ja", 1)
> glimpse(mecab_result)
List of 1
$ : Named chr [1:1445] " " " " " " " " ...
..- attr(*, "names")= chr [1:1445] " " " " " " " " ...
> mecab_result <- talks_ja %>%
+ as.data.frame() %>%
+ RMeCabDF("transcript_ja", 1)
> glimpse(mecab_result)
List of 2551
$ : Named chr [1:1445] " " " " " " " " ...
..- attr(*, "names")= chr [1:1445] " " " " " " " " ...
$ : Named chr [1:2903] " " " " " " " " ...
..- attr(*, "names")= chr [1:2903] " " " " " " " " ...
$ : Named chr [1:2208] " " " " " " " " ...
..- attr(*, "names")= chr [1:2208] " " " " " " " " ...
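> # Selecting one column from a tibble still returns a tibble
> class(talks_ja[, "transcript_ja"])
[1] "tbl_df" "tbl" "data.frame"
> # so its length is 1 (the number of columns)
> length(talks_ja[, "transcript_ja"])
[1] 1
> # Selecting one column from a data.frame returns a character vector
> class(as.data.frame(talks_ja)[, "transcript_ja"])
[1] "character"
> # so its length is the number of rows
> length(as.data.frame(talks_ja)[, "transcript_ja"])
[1] 2551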
> tokens_ja <- purrr::pmap_df(list(nv = mecab_result,
+ title = talks_ja$title_ja),
+ function(nv, title){
+ tibble(title = title,
+ word = nv,
+ hinshi = names(nv))
+ })
> tokens_ja
# A tibble: 6,483,469 x 3
title word hinshi
<chr> <chr> <chr>
1
2
3
4
5
6
# ... with 6,483,463 more rows
> bigram_en <- talks_en %>% select(title_en, transcript_en) %>%
+ tidytext::unnest_tokens(bigram, transcript_en, token = "ngrams",
+ n = 2)
> head(bigram_en)
# A tibble: 6 x 2
title_en bigram
<chr> <chr>
1 ""(Nothing But) Flowers" with string quartet" music here
2 ""(Nothing But) Flowers" with string quartet" here we
3 ""(Nothing But) Flowers" with string quartet" we stand
4 ""(Nothing But) Flowers" with string quartet" stand like
5 ""(Nothing But) Flowers" with string quartet" like an
6 ""(Nothing But) Flowers" with string quartet" an adam
> bigram_ja <- talks_ja %>%
+ as.data.frame() %>%
+ docDF(col = "transcript_ja", type=1, N = 2)
number of extracted terms = 898167
now making a data frame. wait a while!
> bigram_ja %>%
+ select(TERM, POS1, Row1, Row2, Row3, Row4, Row5, Row6, Row7,
+ Row8, Row9) %>%
+ head()
TERM POS1 Row1 Row2 Row3 Row4 Row5 Row6 Row7 Row8 Row9
1 !-( - 0 0 0 0 0 0 0 0 0
2 !-7 - 0 0 0 0 0 0 0 0 0
3 !-Google - 0 0 0 0 0 0 0 0 0
4 !-Little - 0 0 0 0 0 0 0 0 0
5 !-Time - 0 0 0 0 0 0 0 0 0
6 !-Toonchi - 0 0 0 0 0 0 0 0 0
> bigram_ja <- tokens_ja %>%
+ group_by(title) %>%
+ rename(word1 = word,
+ hinshi1 = hinshi) %>%
+ mutate(word2 = lead(word1),
+ hinshi2 = lead(hinshi1)) %>%
+ ungroup() %>%
+ filter(!is.na(word2)) %>%
+ select(title, word1, word2, hinshi1, hinshi2)
> bigram_ja
# A tibble: 6,480,920 x 5
title word1 word2 hinshi1 hinshi2
<chr> <chr> <chr> <chr> <chr>
1
2
3
4
5
6
7
8
# ... with 6,480,912 more rows




{tidytext}と{RMeCab}によるモダンな日本語テキスト分析

More Related Content

What's hot

グラフィカル Lasso を用いた異常検知
グラフィカル Lasso を用いた異常検知グラフィカル Lasso を用いた異常検知
グラフィカル Lasso を用いた異常検知Yuya Takashina
 
不均衡データのクラス分類
不均衡データのクラス分類不均衡データのクラス分類
不均衡データのクラス分類Shintaro Fukushima
 
劣モジュラ最適化と機械学習1章
劣モジュラ最適化と機械学習1章劣モジュラ最適化と機械学習1章
劣モジュラ最適化と機械学習1章Hakky St
 
最適化計算の概要まとめ
最適化計算の概要まとめ最適化計算の概要まとめ
最適化計算の概要まとめYuichiro MInato
 
ベイズ最適化
ベイズ最適化ベイズ最適化
ベイズ最適化MatsuiRyo
 
Rでのtry関数によるエラー処理
Rでのtry関数によるエラー処理Rでのtry関数によるエラー処理
Rでのtry関数によるエラー処理wada, kazumi
 
不遇の標準ライブラリ - valarray
不遇の標準ライブラリ - valarray不遇の標準ライブラリ - valarray
不遇の標準ライブラリ - valarrayRyosuke839
 
PRML第6章「カーネル法」
PRML第6章「カーネル法」PRML第6章「カーネル法」
PRML第6章「カーネル法」Keisuke Sugawara
 
ブラックボックス最適化とその応用
ブラックボックス最適化とその応用ブラックボックス最適化とその応用
ブラックボックス最適化とその応用gree_tech
 
数学カフェ 確率・統計・機械学習回 「速習 確率・統計」
数学カフェ 確率・統計・機械学習回 「速習 確率・統計」数学カフェ 確率・統計・機械学習回 「速習 確率・統計」
数学カフェ 確率・統計・機械学習回 「速習 確率・統計」Ken'ichi Matsui
 
Rにおける大規模データ解析(第10回TokyoWebMining)
Rにおける大規模データ解析(第10回TokyoWebMining)Rにおける大規模データ解析(第10回TokyoWebMining)
Rにおける大規模データ解析(第10回TokyoWebMining)Shintaro Fukushima
 
パターン認識と機械学習 13章 系列データ
パターン認識と機械学習 13章 系列データパターン認識と機械学習 13章 系列データ
パターン認識と機械学習 13章 系列データemonosuke
 
Stanの便利な事後処理関数
Stanの便利な事後処理関数Stanの便利な事後処理関数
Stanの便利な事後処理関数daiki hojo
 
Cmdstanr入門とreduce_sum()解説
Cmdstanr入門とreduce_sum()解説Cmdstanr入門とreduce_sum()解説
Cmdstanr入門とreduce_sum()解説Hiroshi Shimizu
 
アンサンブル学習
アンサンブル学習アンサンブル学習
アンサンブル学習Hidekazu Tanaka
 
ブースティング入門
ブースティング入門ブースティング入門
ブースティング入門Retrieva inc.
 
勾配ブースティングの基礎と最新の動向 (MIRU2020 Tutorial)
勾配ブースティングの基礎と最新の動向 (MIRU2020 Tutorial)勾配ブースティングの基礎と最新の動向 (MIRU2020 Tutorial)
勾配ブースティングの基礎と最新の動向 (MIRU2020 Tutorial)RyuichiKanoh
 
【読書会資料】『StanとRでベイズ統計モデリング』Chapter12:時間や空間を扱うモデル
【読書会資料】『StanとRでベイズ統計モデリング』Chapter12:時間や空間を扱うモデル【読書会資料】『StanとRでベイズ統計モデリング』Chapter12:時間や空間を扱うモデル
【読書会資料】『StanとRでベイズ統計モデリング』Chapter12:時間や空間を扱うモデルMasashi Komori
 
Granger因果による 時系列データの因果推定(因果フェス2015)
Granger因果による時系列データの因果推定(因果フェス2015)Granger因果による時系列データの因果推定(因果フェス2015)
Granger因果による 時系列データの因果推定(因果フェス2015)Takashi J OZAKI
 

What's hot (20)

coordinate descent 法について
coordinate descent 法についてcoordinate descent 法について
coordinate descent 法について
 
グラフィカル Lasso を用いた異常検知
グラフィカル Lasso を用いた異常検知グラフィカル Lasso を用いた異常検知
グラフィカル Lasso を用いた異常検知
 
不均衡データのクラス分類
不均衡データのクラス分類不均衡データのクラス分類
不均衡データのクラス分類
 
劣モジュラ最適化と機械学習1章
劣モジュラ最適化と機械学習1章劣モジュラ最適化と機械学習1章
劣モジュラ最適化と機械学習1章
 
最適化計算の概要まとめ
最適化計算の概要まとめ最適化計算の概要まとめ
最適化計算の概要まとめ
 
ベイズ最適化
ベイズ最適化ベイズ最適化
ベイズ最適化
 
Rでのtry関数によるエラー処理
Rでのtry関数によるエラー処理Rでのtry関数によるエラー処理
Rでのtry関数によるエラー処理
 
不遇の標準ライブラリ - valarray
不遇の標準ライブラリ - valarray不遇の標準ライブラリ - valarray
不遇の標準ライブラリ - valarray
 
PRML第6章「カーネル法」
PRML第6章「カーネル法」PRML第6章「カーネル法」
PRML第6章「カーネル法」
 
ブラックボックス最適化とその応用
ブラックボックス最適化とその応用ブラックボックス最適化とその応用
ブラックボックス最適化とその応用
 
数学カフェ 確率・統計・機械学習回 「速習 確率・統計」
数学カフェ 確率・統計・機械学習回 「速習 確率・統計」数学カフェ 確率・統計・機械学習回 「速習 確率・統計」
数学カフェ 確率・統計・機械学習回 「速習 確率・統計」
 
Rにおける大規模データ解析(第10回TokyoWebMining)
Rにおける大規模データ解析(第10回TokyoWebMining)Rにおける大規模データ解析(第10回TokyoWebMining)
Rにおける大規模データ解析(第10回TokyoWebMining)
 
パターン認識と機械学習 13章 系列データ
パターン認識と機械学習 13章 系列データパターン認識と機械学習 13章 系列データ
パターン認識と機械学習 13章 系列データ
 
Stanの便利な事後処理関数
Stanの便利な事後処理関数Stanの便利な事後処理関数
Stanの便利な事後処理関数
 
Cmdstanr入門とreduce_sum()解説
Cmdstanr入門とreduce_sum()解説Cmdstanr入門とreduce_sum()解説
Cmdstanr入門とreduce_sum()解説
 
アンサンブル学習
アンサンブル学習アンサンブル学習
アンサンブル学習
 
ブースティング入門
ブースティング入門ブースティング入門
ブースティング入門
 
勾配ブースティングの基礎と最新の動向 (MIRU2020 Tutorial)
勾配ブースティングの基礎と最新の動向 (MIRU2020 Tutorial)勾配ブースティングの基礎と最新の動向 (MIRU2020 Tutorial)
勾配ブースティングの基礎と最新の動向 (MIRU2020 Tutorial)
 
【読書会資料】『StanとRでベイズ統計モデリング』Chapter12:時間や空間を扱うモデル
【読書会資料】『StanとRでベイズ統計モデリング』Chapter12:時間や空間を扱うモデル【読書会資料】『StanとRでベイズ統計モデリング』Chapter12:時間や空間を扱うモデル
【読書会資料】『StanとRでベイズ統計モデリング』Chapter12:時間や空間を扱うモデル
 
Granger因果による 時系列データの因果推定(因果フェス2015)
Granger因果による時系列データの因果推定(因果フェス2015)Granger因果による時系列データの因果推定(因果フェス2015)
Granger因果による 時系列データの因果推定(因果フェス2015)
 

Similar to {tidytext}と{RMeCab}によるモダンな日本語テキスト分析

PLOTCON NYC: Behind Every Great Plot There's a Great Deal of Wrangling
PLOTCON NYC: Behind Every Great Plot There's a Great Deal of WranglingPLOTCON NYC: Behind Every Great Plot There's a Great Deal of Wrangling
PLOTCON NYC: Behind Every Great Plot There's a Great Deal of WranglingPlotly
 
pa-pe-pi-po-pure Python Text Processing
pa-pe-pi-po-pure Python Text Processingpa-pe-pi-po-pure Python Text Processing
pa-pe-pi-po-pure Python Text ProcessingRodrigo Senra
 
令和から本気出す
令和から本気出す令和から本気出す
令和から本気出すTakashi Kitano
 
Pre-Bootcamp introduction to Elixir
Pre-Bootcamp introduction to ElixirPre-Bootcamp introduction to Elixir
Pre-Bootcamp introduction to ElixirPaweł Dawczak
 
Learn 90% of Python in 90 Minutes
Learn 90% of Python in 90 MinutesLearn 90% of Python in 90 Minutes
Learn 90% of Python in 90 MinutesMatt Harrison
 
Τα Πολύ Βασικά για την Python
Τα Πολύ Βασικά για την PythonΤα Πολύ Βασικά για την Python
Τα Πολύ Βασικά για την PythonMoses Boudourides
 
Derrubando mitos em Python
Derrubando mitos em PythonDerrubando mitos em Python
Derrubando mitos em PythonDenis Costa
 
Beautiful python - PyLadies
Beautiful python - PyLadiesBeautiful python - PyLadies
Beautiful python - PyLadiesAlicia Pérez
 
Palestra sobre Collections com Python
Palestra sobre Collections com PythonPalestra sobre Collections com Python
Palestra sobre Collections com Pythonpugpe
 
M12 random forest-part01
M12 random forest-part01M12 random forest-part01
M12 random forest-part01Raman Kannan
 
R is a very flexible and powerful programming language, as well as a.pdf
R is a very flexible and powerful programming language, as well as a.pdfR is a very flexible and powerful programming language, as well as a.pdf
R is a very flexible and powerful programming language, as well as a.pdfannikasarees
 
Text mining and social network analysis of twitter data part 1
Text mining and social network analysis of twitter data part 1Text mining and social network analysis of twitter data part 1
Text mining and social network analysis of twitter data part 1Johan Blomme
 
Helvetia
HelvetiaHelvetia
HelvetiaESUG
 
Easy HTML Tables in RStudio with Tabyl and kableExtra
Easy HTML Tables in RStudio with Tabyl and kableExtraEasy HTML Tables in RStudio with Tabyl and kableExtra
Easy HTML Tables in RStudio with Tabyl and kableExtraBarry DeCicco
 
RではじめるTwitter解析
RではじめるTwitter解析RではじめるTwitter解析
RではじめるTwitter解析Takeshi Arabiki
 

Similar to {tidytext}と{RMeCab}によるモダンな日本語テキスト分析 (20)

PLOTCON NYC: Behind Every Great Plot There's a Great Deal of Wrangling
PLOTCON NYC: Behind Every Great Plot There's a Great Deal of WranglingPLOTCON NYC: Behind Every Great Plot There's a Great Deal of Wrangling
PLOTCON NYC: Behind Every Great Plot There's a Great Deal of Wrangling
 
pa-pe-pi-po-pure Python Text Processing
pa-pe-pi-po-pure Python Text Processingpa-pe-pi-po-pure Python Text Processing
pa-pe-pi-po-pure Python Text Processing
 
令和から本気出す
令和から本気出す令和から本気出す
令和から本気出す
 
Pre-Bootcamp introduction to Elixir
Pre-Bootcamp introduction to ElixirPre-Bootcamp introduction to Elixir
Pre-Bootcamp introduction to Elixir
 
Learn 90% of Python in 90 Minutes
Learn 90% of Python in 90 MinutesLearn 90% of Python in 90 Minutes
Learn 90% of Python in 90 Minutes
 
Τα Πολύ Βασικά για την Python
Τα Πολύ Βασικά για την PythonΤα Πολύ Βασικά για την Python
Τα Πολύ Βασικά για την Python
 
Derrubando mitos em Python
Derrubando mitos em PythonDerrubando mitos em Python
Derrubando mitos em Python
 
Elixir
ElixirElixir
Elixir
 
Beautiful python - PyLadies
Beautiful python - PyLadiesBeautiful python - PyLadies
Beautiful python - PyLadies
 
Palestra sobre Collections com Python
Palestra sobre Collections com PythonPalestra sobre Collections com Python
Palestra sobre Collections com Python
 
R programming language
R programming languageR programming language
R programming language
 
M12 random forest-part01
M12 random forest-part01M12 random forest-part01
M12 random forest-part01
 
R is a very flexible and powerful programming language, as well as a.pdf
R is a very flexible and powerful programming language, as well as a.pdfR is a very flexible and powerful programming language, as well as a.pdf
R is a very flexible and powerful programming language, as well as a.pdf
 
Text mining and social network analysis of twitter data part 1
Text mining and social network analysis of twitter data part 1Text mining and social network analysis of twitter data part 1
Text mining and social network analysis of twitter data part 1
 
Python 1
Python 1Python 1
Python 1
 
Py ohio
Py ohioPy ohio
Py ohio
 
Helvetia
HelvetiaHelvetia
Helvetia
 
Easy HTML Tables in RStudio with Tabyl and kableExtra
Easy HTML Tables in RStudio with Tabyl and kableExtraEasy HTML Tables in RStudio with Tabyl and kableExtra
Easy HTML Tables in RStudio with Tabyl and kableExtra
 
RではじめるTwitter解析
RではじめるTwitter解析RではじめるTwitter解析
RではじめるTwitter解析
 
Delete statement in PHP
Delete statement in PHPDelete statement in PHP
Delete statement in PHP
 

More from Takashi Kitano

好みの日本酒を呑みたい! 〜さけのわデータで探す自分好みの酒〜
好みの日本酒を呑みたい! 〜さけのわデータで探す自分好みの酒〜好みの日本酒を呑みたい! 〜さけのわデータで探す自分好みの酒〜
好みの日本酒を呑みたい! 〜さけのわデータで探す自分好みの酒〜Takashi Kitano
 
{shiny}と{leaflet}による地図アプリ開発Tips
{shiny}と{leaflet}による地図アプリ開発Tips{shiny}と{leaflet}による地図アプリ開発Tips
{shiny}と{leaflet}による地図アプリ開発TipsTakashi Kitano
 
{tidygraph}と{ggraph}によるモダンなネットワーク分析
{tidygraph}と{ggraph}によるモダンなネットワーク分析{tidygraph}と{ggraph}によるモダンなネットワーク分析
{tidygraph}と{ggraph}によるモダンなネットワーク分析Takashi Kitano
 
20170923 excelユーザーのためのr入門
20170923 excelユーザーのためのr入門20170923 excelユーザーのためのr入門
20170923 excelユーザーのためのr入門Takashi Kitano
 
mxnetで頑張る深層学習
mxnetで頑張る深層学習mxnetで頑張る深層学習
mxnetで頑張る深層学習Takashi Kitano
 
可視化周辺の進化がヤヴァイ 〜2016〜
可視化周辺の進化がヤヴァイ 〜2016〜可視化周辺の進化がヤヴァイ 〜2016〜
可視化周辺の進化がヤヴァイ 〜2016〜Takashi Kitano
 
Rによるウイスキー分析
Rによるウイスキー分析Rによるウイスキー分析
Rによるウイスキー分析Takashi Kitano
 
20160311 基礎からのベイズ統計学輪読会第6章 公開ver
20160311 基礎からのベイズ統計学輪読会第6章 公開ver20160311 基礎からのベイズ統計学輪読会第6章 公開ver
20160311 基礎からのベイズ統計学輪読会第6章 公開verTakashi Kitano
 
20140625 rでのデータ分析(仮) for_tokyor
20140625 rでのデータ分析(仮) for_tokyor20140625 rでのデータ分析(仮) for_tokyor
20140625 rでのデータ分析(仮) for_tokyorTakashi Kitano
 
lubridateパッケージ入門
lubridateパッケージ入門lubridateパッケージ入門
lubridateパッケージ入門Takashi Kitano
 
Google's r style guideのすゝめ
Google's r style guideのすゝめGoogle's r style guideのすゝめ
Google's r style guideのすゝめTakashi Kitano
 

More from Takashi Kitano (13)

好みの日本酒を呑みたい! 〜さけのわデータで探す自分好みの酒〜
好みの日本酒を呑みたい! 〜さけのわデータで探す自分好みの酒〜好みの日本酒を呑みたい! 〜さけのわデータで探す自分好みの酒〜
好みの日本酒を呑みたい! 〜さけのわデータで探す自分好みの酒〜
 
{shiny}と{leaflet}による地図アプリ開発Tips
{shiny}と{leaflet}による地図アプリ開発Tips{shiny}と{leaflet}による地図アプリ開発Tips
{shiny}と{leaflet}による地図アプリ開発Tips
 
{tidygraph}と{ggraph}によるモダンなネットワーク分析
{tidygraph}と{ggraph}によるモダンなネットワーク分析{tidygraph}と{ggraph}によるモダンなネットワーク分析
{tidygraph}と{ggraph}によるモダンなネットワーク分析
 
20170923 excelユーザーのためのr入門
20170923 excelユーザーのためのr入門20170923 excelユーザーのためのr入門
20170923 excelユーザーのためのr入門
 
mxnetで頑張る深層学習
mxnetで頑張る深層学習mxnetで頑張る深層学習
mxnetで頑張る深層学習
 
可視化周辺の進化がヤヴァイ 〜2016〜
可視化周辺の進化がヤヴァイ 〜2016〜可視化周辺の進化がヤヴァイ 〜2016〜
可視化周辺の進化がヤヴァイ 〜2016〜
 
Rによるウイスキー分析
Rによるウイスキー分析Rによるウイスキー分析
Rによるウイスキー分析
 
20160311 基礎からのベイズ統計学輪読会第6章 公開ver
20160311 基礎からのベイズ統計学輪読会第6章 公開ver20160311 基礎からのベイズ統計学輪読会第6章 公開ver
20160311 基礎からのベイズ統計学輪読会第6章 公開ver
 
20140625 rでのデータ分析(仮) for_tokyor
20140625 rでのデータ分析(仮) for_tokyor20140625 rでのデータ分析(仮) for_tokyor
20140625 rでのデータ分析(仮) for_tokyor
 
lubridateパッケージ入門
lubridateパッケージ入門lubridateパッケージ入門
lubridateパッケージ入門
 
20150329 tokyo r47
20150329 tokyo r4720150329 tokyo r47
20150329 tokyo r47
 
20140920 tokyo r43
20140920 tokyo r4320140920 tokyo r43
20140920 tokyo r43
 
Google's r style guideのすゝめ
Google's r style guideのすゝめGoogle's r style guideのすゝめ
Google's r style guideのすゝめ
 

Recently uploaded

DATA ANALYSIS using various data sets like shoping data set etc
DATA ANALYSIS using various data sets like shoping data set etcDATA ANALYSIS using various data sets like shoping data set etc
DATA ANALYSIS using various data sets like shoping data set etclalithasri22
 
6 Tips for Interpretable Topic Models _ by Nicha Ruchirawat _ Towards Data Sc...
6 Tips for Interpretable Topic Models _ by Nicha Ruchirawat _ Towards Data Sc...6 Tips for Interpretable Topic Models _ by Nicha Ruchirawat _ Towards Data Sc...
6 Tips for Interpretable Topic Models _ by Nicha Ruchirawat _ Towards Data Sc...Dr Arash Najmaei ( Phd., MBA, BSc)
 
Digital Marketing Plan, how digital marketing works
Digital Marketing Plan, how digital marketing worksDigital Marketing Plan, how digital marketing works
Digital Marketing Plan, how digital marketing worksdeepakthakur548787
 
why-transparency-and-traceability-are-essential-for-sustainable-supply-chains...
why-transparency-and-traceability-are-essential-for-sustainable-supply-chains...why-transparency-and-traceability-are-essential-for-sustainable-supply-chains...
why-transparency-and-traceability-are-essential-for-sustainable-supply-chains...Jack Cole
 
Digital Indonesia Report 2024 by We Are Social .pdf
Digital Indonesia Report 2024 by We Are Social .pdfDigital Indonesia Report 2024 by We Are Social .pdf
Digital Indonesia Report 2024 by We Are Social .pdfNicoChristianSunaryo
 
Data Analysis Project Presentation: Unveiling Your Ideal Customer, Bank Custo...
Data Analysis Project Presentation: Unveiling Your Ideal Customer, Bank Custo...Data Analysis Project Presentation: Unveiling Your Ideal Customer, Bank Custo...
Data Analysis Project Presentation: Unveiling Your Ideal Customer, Bank Custo...Boston Institute of Analytics
 
Presentation of project of business person who are success
Presentation of project of business person who are successPresentation of project of business person who are success
Presentation of project of business person who are successPratikSingh115843
 
English-8-Q4-W3-Synthesizing-Essential-Information-From-Various-Sources-1.pdf
English-8-Q4-W3-Synthesizing-Essential-Information-From-Various-Sources-1.pdfEnglish-8-Q4-W3-Synthesizing-Essential-Information-From-Various-Sources-1.pdf
English-8-Q4-W3-Synthesizing-Essential-Information-From-Various-Sources-1.pdfblazblazml
 
Decoding Movie Sentiments: Analyzing Reviews with Data Analysis model
Decoding Movie Sentiments: Analyzing Reviews with Data Analysis modelDecoding Movie Sentiments: Analyzing Reviews with Data Analysis model
Decoding Movie Sentiments: Analyzing Reviews with Data Analysis modelBoston Institute of Analytics
 
Bank Loan Approval Analysis: A Comprehensive Data Analysis Project
Bank Loan Approval Analysis: A Comprehensive Data Analysis ProjectBank Loan Approval Analysis: A Comprehensive Data Analysis Project
Bank Loan Approval Analysis: A Comprehensive Data Analysis ProjectBoston Institute of Analytics
 
Role of Consumer Insights in business transformation
Role of Consumer Insights in business transformationRole of Consumer Insights in business transformation
Role of Consumer Insights in business transformationAnnie Melnic
 
Statistics For Management by Richard I. Levin 8ed.pdf
Statistics For Management by Richard I. Levin 8ed.pdfStatistics For Management by Richard I. Levin 8ed.pdf
Statistics For Management by Richard I. Levin 8ed.pdfnikeshsingh56
 
IBEF report on the Insurance market in India
IBEF report on the Insurance market in IndiaIBEF report on the Insurance market in India
IBEF report on the Insurance market in IndiaManalVerma4
 
Non Text Magic Studio Magic Design for Presentations L&P.pdf
Non Text Magic Studio Magic Design for Presentations L&P.pdfNon Text Magic Studio Magic Design for Presentations L&P.pdf
Non Text Magic Studio Magic Design for Presentations L&P.pdfPratikPatil591646
 

Recently uploaded (17)

DATA ANALYSIS using various data sets like shoping data set etc
DATA ANALYSIS using various data sets like shoping data set etcDATA ANALYSIS using various data sets like shoping data set etc
DATA ANALYSIS using various data sets like shoping data set etc
 
6 Tips for Interpretable Topic Models _ by Nicha Ruchirawat _ Towards Data Sc...
6 Tips for Interpretable Topic Models _ by Nicha Ruchirawat _ Towards Data Sc...6 Tips for Interpretable Topic Models _ by Nicha Ruchirawat _ Towards Data Sc...
6 Tips for Interpretable Topic Models _ by Nicha Ruchirawat _ Towards Data Sc...
 
2023 Survey Shows Dip in High School E-Cigarette Use
2023 Survey Shows Dip in High School E-Cigarette Use2023 Survey Shows Dip in High School E-Cigarette Use
2023 Survey Shows Dip in High School E-Cigarette Use
 
Digital Marketing Plan, how digital marketing works
Digital Marketing Plan, how digital marketing worksDigital Marketing Plan, how digital marketing works
Digital Marketing Plan, how digital marketing works
 
why-transparency-and-traceability-are-essential-for-sustainable-supply-chains...
why-transparency-and-traceability-are-essential-for-sustainable-supply-chains...why-transparency-and-traceability-are-essential-for-sustainable-supply-chains...
why-transparency-and-traceability-are-essential-for-sustainable-supply-chains...
 
Insurance Churn Prediction Data Analysis Project
Insurance Churn Prediction Data Analysis ProjectInsurance Churn Prediction Data Analysis Project
Insurance Churn Prediction Data Analysis Project
 
Digital Indonesia Report 2024 by We Are Social .pdf
Digital Indonesia Report 2024 by We Are Social .pdfDigital Indonesia Report 2024 by We Are Social .pdf
Digital Indonesia Report 2024 by We Are Social .pdf
 
Data Analysis Project Presentation: Unveiling Your Ideal Customer, Bank Custo...
Data Analysis Project Presentation: Unveiling Your Ideal Customer, Bank Custo...Data Analysis Project Presentation: Unveiling Your Ideal Customer, Bank Custo...
Data Analysis Project Presentation: Unveiling Your Ideal Customer, Bank Custo...
 
Presentation of project of business person who are success
Presentation of project of business person who are successPresentation of project of business person who are success
Presentation of project of business person who are success
 
English-8-Q4-W3-Synthesizing-Essential-Information-From-Various-Sources-1.pdf
English-8-Q4-W3-Synthesizing-Essential-Information-From-Various-Sources-1.pdfEnglish-8-Q4-W3-Synthesizing-Essential-Information-From-Various-Sources-1.pdf
English-8-Q4-W3-Synthesizing-Essential-Information-From-Various-Sources-1.pdf
 
Decoding Movie Sentiments: Analyzing Reviews with Data Analysis model
Decoding Movie Sentiments: Analyzing Reviews with Data Analysis modelDecoding Movie Sentiments: Analyzing Reviews with Data Analysis model
Decoding Movie Sentiments: Analyzing Reviews with Data Analysis model
 
Bank Loan Approval Analysis: A Comprehensive Data Analysis Project
Bank Loan Approval Analysis: A Comprehensive Data Analysis ProjectBank Loan Approval Analysis: A Comprehensive Data Analysis Project
Bank Loan Approval Analysis: A Comprehensive Data Analysis Project
 
Role of Consumer Insights in business transformation
Role of Consumer Insights in business transformationRole of Consumer Insights in business transformation
Role of Consumer Insights in business transformation
 
Statistics For Management by Richard I. Levin 8ed.pdf
Statistics For Management by Richard I. Levin 8ed.pdfStatistics For Management by Richard I. Levin 8ed.pdf
Statistics For Management by Richard I. Levin 8ed.pdf
 
Data Analysis Project: Stroke Prediction
Data Analysis Project: Stroke PredictionData Analysis Project: Stroke Prediction
Data Analysis Project: Stroke Prediction
 
IBEF report on the Insurance market in India
IBEF report on the Insurance market in IndiaIBEF report on the Insurance market in India
IBEF report on the Insurance market in India
 
Non Text Magic Studio Magic Design for Presentations L&P.pdf
Non Text Magic Studio Magic Design for Presentations L&P.pdfNon Text Magic Studio Magic Design for Presentations L&P.pdf
Non Text Magic Studio Magic Design for Presentations L&P.pdf
 

{tidytext}と{RMeCab}によるモダンな日本語テキスト分析

  • 1.
  • 2. > me $name [1] "Takashi Kitano" $twitter [1] "@kashitan" $work_in [1] " "
  • 4.
  • 6.
  • 7. > head(talks_en) # A tibble: 6 x 2 title_en transcript_en <chr> <chr> 1 Fake videos of real people — and how to … Look at these images. Now… 2 How to build synthetic DNA and send it a… "Alright, let me tell you… 3 Technology that knows what you're feeling "What happens when techno… 4 How to get empowered, not overpowered, b… "After 13.8 billion years… 5 Where joy hides and how to find it "It's 2008, and I'm just … 6 ""You Found Me"" (Cello music starts) You …
  • 8. > talks_en %>% tidytext::unnest_tokens(word, transcript_en)
  • 9. > talks_en %>% tidytext::unnest_tokens(word, transcript_en) # A tibble: 9,840 x 2 title_en word <chr> <chr> 1 Fake videos of real people — and how to spot them look 2 Fake videos of real people — and how to spot them at 3 Fake videos of real people — and how to spot them these 4 Fake videos of real people — and how to spot them images 5 Fake videos of real people — and how to spot them now 6 Fake videos of real people — and how to spot them tell 7 Fake videos of real people — and how to spot them me 8 Fake videos of real people — and how to spot them which 9 Fake videos of real people — and how to spot them obama 10 Fake videos of real people — and how to spot them here
  • 10.
  • 11. > talks_ja %>% head() # A tibble: 6 x 2 title_ja transcript_ja <chr> <chr> 1 … … 2 DNA … … 3 … 4 AI AI … 138 … 5 2008 … 6 You Found Me … …
  • 12. > talks_ja %>% tidytext::unnest_tokens(word, transcript_ja) # A tibble: 6,266,182 x 2 title_ja word <chr> <chr> 1 2 3 4 5 6 # ... with 6,266,172 more rows
  • 14. > talks_ja %>% tidytext::unnest_tokens(word, transcript_ja) # A tibble: 6,266,182 x 2 title_ja word <chr> <chr> 1 2 3 4 5 6 # ... with 6,266,172 more rows
  • 15.
  • 16.
  • 17.
  • 18. > mecab_result <- talks_ja %>% + RMeCabDF("transcript_ja", 1) > glimpse(mecab_result) List of 1 $ : Named chr [1:1445] " " " " " " " " ... ..- attr(*, "names")= chr [1:1445] " " " " " " " " ...
  • 19. > mecab_result <- talks_ja %>% + as.data.frame() %>% + RMeCabDF("transcript_ja", 1) > glimpse(mecab_result) List of 2551 $ : Named chr [1:1445] " " " " " " " " ... ..- attr(*, "names")= chr [1:1445] " " " " " " " " ... $ : Named chr [1:2903] " " " " " " " " ... ..- attr(*, "names")= chr [1:2903] " " " " " " " " ... $ : Named chr [1:2208] " " " " " " " " ... ..- attr(*, "names")= chr [1:2208] " " " " " " " " ...
  • 20. > # tibble 1 tibble > class(talks_ja[, "transcript_ja"]) [1] "tbl_df" "tbl" "data.frame" > # 1 > length(talks_ja[, "transcript_ja"]) [1] 1 > # data.frame 1 > class(as.data.frame(talks_ja)[, "transcript_ja"]) [1] "character" > # > length(as.data.frame(talks_ja)[, "transcript_ja"]) [1] 2551
  • 21. > tokens_ja <- purrr::pmap_df(list(nv = mecab_result, + title = talks_ja$title_ja), + function(nv, title){ + tibble(title = title, + word = nv, + hinshi = names(nv)) + })
  • 22. > tokens_ja # A tibble: 6,483,469 x 3 title word hinshi <chr> <chr> <chr> 1 2 3 4 5 6 # ... with 6,483,463 more rows
  • 23.
  • 24. > bigram_en <- talks_en %>% select(title_en, transcript_en) %>% + tidytext::unnest_tokens(bigram, transcript_en, token = "ngrams", n = 2) > head(bigram_en) # A tibble: 6 x 2 title_en bigram <chr> <chr> 1 ""(Nothing But) Flowers" with string quartet" music here 2 ""(Nothing But) Flowers" with string quartet" here we 3 ""(Nothing But) Flowers" with string quartet" we stand 4 ""(Nothing But) Flowers" with string quartet" stand like 5 ""(Nothing But) Flowers" with string quartet" like an 6 ""(Nothing But) Flowers" with string quartet" an adam
  • 25.
  • 26. > bigram_ja <- talks_ja %>% + as.data.frame() %>% + docDF(col = "transcript_ja", type=1, N = 2) number of extracted terms = 898167 now making a data frame. wait a while!
  • 27. > bigram_ja.bk %>% + select(TERM, POS1, Row1, Row2, Row3, Row4, Row5, Row6, Row7, Row8, Row9) %>% + head() TERM POS1 Row1 Row2 Row3 Row4 Row5 Row6 Row7 Row8 Row9 1 !-( - 0 0 0 0 0 0 0 0 0 2 !-7 - 0 0 0 0 0 0 0 0 0 3 !-Google - 0 0 0 0 0 0 0 0 0 4 !-Little - 0 0 0 0 0 0 0 0 0 5 !-Time - 0 0 0 0 0 0 0 0 0 6 !-Toonchi - 0 0 0 0 0 0 0 0 0
  • 28.
  • 29. > bigram_ja <- tokens_ja %>% + group_by(title) %>% + rename(word1 = word, + hinshi1 = hinshi) %>% + mutate(word2 = lead(word1), + hinshi2 = lead(hinshi1)) %>% + ungroup() %>% + filter(!is.na(word2)) %>% + select(title, word1, word2, hinshi1, hinshi2)
  • 30. > bigram_ja # A tibble: 6,480,920 x 5 title word1 word2 hinshi1 hinshi2 <chr> <chr> <chr> <chr> <chr> 1 2 3 4 5 6 7 8 # ... with 6,480,912 more rows
  • 31.