RHadoop, Hadoop for R
[Figure: "Scholarly Activity 05-09 change" -- bar chart of the 2005-2009 change in scholarly activity (y axis -37.5% to 50%) for R, SAS, SPSS, S-Plus, and Stata, paired with a log-scale plot of package counts (1 to 10000) over 2002-2010. Source: http://r4stats.com/popularity]
David Champagne, CTO
The RHadoop packages:

    rhdfs   (HDFS access)
    rhbase  (HBase access)
    rmr     (MapReduce)
sapply(data, function)

mapreduce(data, map = function)
library(rmr)

mapreduce(…)
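
To make the analogy concrete, here is a minimal sketch of the same computation done both ways (the toy vector and the squaring function are ours, not from the deck; a working rmr installation is assumed):

# a minimal sketch of the sapply/mapreduce analogy; the toy data is
# invented and a working rmr installation is assumed
library(rmr)

data = 1:10
sapply(data, function(x) x^2)               # in memory, one machine

dfs.data = to.dfs(data)                     # stage the data in the DFS
squared = mapreduce(input = dfs.data,
                    map = function(k, v) keyval(k, v^2))
from.dfs(squared)                           # read the result back into R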
Where rmr sits among Hadoop programming tools (the slide's positioning diagram, flattened into two groups):

    Expose MapReduce:  Java, C++; rmr, Rhipe, Dumbo, Pydoop, Hadoopy
    Hide MapReduce:    Cascading, Crunch; Cascalog, Scalding, Scrunch; Hive, Pig
mapreduce(input, output, map, reduce)
map = function(k, v) if (hash(k) %% 10 == 0) keyval(k, v)

reduce = function(k, vv) keyval(k, length(vv))
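
Putting the two functions above into one job might look like this (a sketch: input stands for an existing DFS path and hash for a user-supplied hash function, neither of which is defined on the slide):

# a sketch tying the map and reduce above together; 'input' and 'hash'
# are assumed to exist, they are not defined on the slide
sampled.counts = mapreduce(
    input = input,
    map = function(k, v) if (hash(k) %% 10 == 0) keyval(k, v),
    reduce = function(k, vv) keyval(k, length(vv)))

The map keeps roughly one key in ten; the reduce then counts the surviving values per key.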
condition = function(x) x > 10

out = mapreduce(
        input = input,
        map = function(k, v)
                 if (condition(v)) keyval(k, v))
x = from.dfs(hdfs.object)

hdfs.object = to.dfs(x)
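
The two calls form a round trip between R memory and HDFS; a minimal sketch (the toy matrix is invented for illustration):

# round trip between R memory and the DFS; the toy matrix is invented
m = matrix(rnorm(100), ncol = 10)
stored = to.dfs(m)          # R object -> DFS
m2 = from.dfs(stored)       # DFS -> R object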
INSERT OVERWRITE TABLE pv_gender_sum
SELECT pv_users.gender, count (DISTINCT pv_users.userid)
FROM pv_users
GROUP BY pv_users.gender;

mapreduce(input =
  mapreduce(input = "pv_users",
    map = function(k, v) keyval(v['userid'], v['gender']),
    reduce = function(uid, genders)
      lapply(unique(genders), function(g) keyval(NULL, g))),
  output = "pv_gender_sum",
  map = function(x, gender) keyval(gender, 1),
  reduce = function(gender, counts)
             keyval(gender, sum(unlist(counts))))
kmeans =
  function(points, ncenters, iterations = 10,
           distfun = function(a,b) norm(as.matrix(a-b), type = 'F')) {
    newCenters = kmeans.iter(points, distfun, ncenters = ncenters)
    for(i in 1:iterations) {
      newCenters = kmeans.iter(points, distfun, centers = newCenters)}
    newCenters}


kmeans.iter =
  function(points, distfun, ncenters = dim(centers)[1], centers = NULL) {
    from.dfs(
      mapreduce(
        input = points,
        map = if (is.null(centers)) {
                 function(k,v) keyval(sample(1:ncenters,1),v)}
              else {
                 function(k,v) {
                   distances = apply(centers, 1, function(c) distfun(c,v))
                   keyval(centers[which.min(distances),], v)}},
        reduce = function(k,vv) keyval(NULL, apply(do.call(rbind, vv), 2, mean))),
      to.data.frame = T)}
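
One way to exercise the implementation above (a sketch: the sample points and the to.dfs staging are our own assumptions, not from the deck):

# a sketch only: the sample points and the to.dfs call are assumptions;
# each list element is meant to become one input record (a point)
points = lapply(1:100, function(i) rnorm(2, mean = i %% 3))
centers = kmeans(to.dfs(points), ncenters = 3, iterations = 5)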
#!/usr/bin/python
import sys
from math import fabs
from org.apache.pig.scripting import Pig

filename = "student.txt"
k = 4
tolerance = 0.01

MAX_SCORE = 4
MIN_SCORE = 0
MAX_ITERATION = 100

# initial centroid, equally divide the space
initial_centroids = ""
last_centroids = [None] * k
for i in range(k):
    last_centroids[i] = MIN_SCORE + float(i)/k*(MAX_SCORE-MIN_SCORE)
    initial_centroids = initial_centroids + str(last_centroids[i])
    if i!=k-1:
        initial_centroids = initial_centroids + ":"

P = Pig.compile("""register udf.jar
                   DEFINE find_centroid FindCentroid('$centroids');
                   raw = load 'student.txt' as (name:chararray, age:int, gpa:double);
                   centroided = foreach raw generate gpa, find_centroid(gpa) as centroid;
                   grouped = group centroided by centroid;
                   result = foreach grouped generate group, AVG(centroided.gpa);
                   store result into 'output';
                """)

converged = False
iter_num = 0
while iter_num<MAX_ITERATION:
    Q = P.bind({'centroids':initial_centroids})
    results = Q.runSingle()
    if not results.isSuccessful():
        raise Exception("Pig job failed")
    iter = results.result("result").iterator()
    centroids = [None] * k
    distance_move = 0
    # get the new centroids for this iteration and calculate how far they moved since the last iteration
    for i in range(k):
        tuple = iter.next()
        centroids[i] = float(str(tuple.get(1)))
        distance_move = distance_move + fabs(last_centroids[i]-centroids[i])
    distance_move = distance_move / k
    Pig.fs("rmr output")
    print("iteration " + str(iter_num))
    print("average distance moved: " + str(distance_move))
    if distance_move<tolerance:
        sys.stdout.write("k-means converged at centroids: [")
        sys.stdout.write(",".join(str(v) for v in centroids))
        sys.stdout.write("]n")
        converged = True
        break
    last_centroids = centroids[:]
    initial_centroids = ""
    for i in range(k):
        initial_centroids = initial_centroids + str(last_centroids[i])
        if i!=k-1:
            initial_centroids = initial_centroids + ":"
    iter_num += 1

if not converged:
    print("did not converge after " + str(iter_num) + " iterations")
    sys.stdout.write("last centroids: [")
    sys.stdout.write(",".join(str(v) for v in last_centroids))
    sys.stdout.write("]\n")
import java.io.IOException;

import org.apache.pig.EvalFunc;
import org.apache.pig.data.Tuple;


public class FindCentroid extends EvalFunc<Double> {
    double[] centroids;
    public FindCentroid(String initialCentroid) {
        String[] centroidStrings = initialCentroid.split(":");
        centroids = new double[centroidStrings.length];
        for (int i=0;i<centroidStrings.length;i++)
            centroids[i] = Double.parseDouble(centroidStrings[i]);
    }
    @Override
    public Double exec(Tuple input) throws IOException {
        double min_distance = Double.MAX_VALUE;
        double closest_centroid = 0;
        for (double centroid : centroids) {
            double distance = Math.abs(centroid - (Double)input.get(0));
            if (distance < min_distance) {
                min_distance = distance;
                closest_centroid = centroid;
            }
        }
        return closest_centroid;
    }

}
mapreduce(mapreduce(…

mapreduce(input = c(input1, input2), …)

equijoin = function(
    left.input, right.input, input,
    output,
    outer,
    map.left, map.right,
    reduce, reduce.all)
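
A sketch of how the signature might be used (out1 and out2 stand for outputs of earlier mapreduce calls; the argument values are assumptions, only the parameter names come from the slide):

# a sketch only: out1 and out2 stand for earlier mapreduce outputs;
# the argument values are assumptions, the names come from the slide
joined = equijoin(left.input = out1,
                  right.input = out2,
                  output = "joined",
                  outer = "left")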
out1 = mapreduce(…)
mapreduce(input = out1, <xyz>)
mapreduce(input = out1, <abc>)

abstract.job = function(input, output, …) {
   …
   result = mapreduce(input = input,
                      output = output)
   …
   result}
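
The same pattern turns a recurring job into an ordinary R function; a sketch built from the earlier filtering example (filter.job and predicate are our names, not part of the API):

# a sketch of the parametrization pattern, built from the earlier
# filtering example; 'filter.job' and 'predicate' are our names
filter.job = function(input, output, predicate)
   mapreduce(input = input,
             output = output,
             map = function(k, v) if (predicate(v)) keyval(k, v))

out = filter.job(input, "filtered", function(x) x > 10)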
input.format, output.format, format
combine
reduce.on.data.frame
local, hadoop backends
backend.parameters
profiling
verbose
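
The local backend in the list above makes debugging easy; a sketch, assuming the rmr.options.set() call of the rmr 1.x API:

# a sketch, assuming rmr.options.set() from the rmr 1.x API
rmr.options.set(backend = "local")    # run jobs in-process while debugging
from.dfs(mapreduce(input = to.dfs(1:10),
                   map = function(k, v) keyval(v, v^2)))
rmr.options.set(backend = "hadoop")   # switch back to the cluster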
RHADOOP USER
ONE FAT CLUSTER AVE.
HYDROPOWER CITY, OR 0x0000

RHADOOP@REVOLUTIONANALYTICS.COM


Speaker notes

  1. What is R? What is RHadoop? An open source project started by Revolution that aims to make R and Hadoop work together; what is Revolution.
  2. Faster, assured builds; large-data extensions; web deployments; tech support; consulting services; training.
  3. Hadoop brings horizontal scalability, R sophisticated analytics; the combination could be powerful.
  4. Hadoop is one project but also a family of projects. We started the integration path with three projects targeting three members of the Hadoop family. rhdfs provides access to the HDFS file system and can be divided into two sub-APIs: file level and byte level.
  5. A way to access big data sets.
  6. A simple way to write parallel programs -- everyone will have to.
  7. Very R-like, building on the functional characteristics of R.
  8. Just a library.
  9. Much simpler than writing Java. Not as simple as Hive or Pig at what they do, but more general. Great for prototyping; can transition to production -- optimize instead of rewriting. Lower risk, always executable.
  10. mapreduce is the first and most important element of the API. Input can be as simple as a path; output the same, or skip it to get managed space with stubs. map and reduce are simple R functions, as opposed to Rhipe.
  11. Simple map example: filtering. Reduce example: counting.
  12. Easy to parametrize jobs.
  13. The second pillar of the API: the memory-HDFS bridge.
  14. A language like Hive makes a class of problems easy to solve, but it is not a general tool. The cost of doing the same operation in rmr is modest, and it provides a broader set of capabilities.
  15. kmeans implementation in two simple functions; note how easy it is to get data in and out of the cluster.
  16. Skip quickly to the other slides; notice three different languages.
  17. More things you can do combining the elements of the API.