Spark WordCount Source Code Analysis (Illustrated)

1. Maven dependencies

<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-core_2.11</artifactId>
    <version>2.3.4</version>
    <exclusions>
        <!-- exclude the netty pulled in by spark-core; netty-all is added explicitly below -->
        <exclusion>
            <groupId>io.netty</groupId>
            <artifactId>netty</artifactId>
        </exclusion>
    </exclusions>
</dependency>

<dependency>
    <groupId>io.netty</groupId>
    <artifactId>netty-all</artifactId>
    <version>4.1.18.Final</version>
</dependency>

2. Scala code

package spark

import org.apache.spark.{SparkConf, SparkContext}

object WordCount {

  def main(args: Array[String]): Unit = {

    val conf = new SparkConf()
    // Set the name under which the Spark application runs
    conf.setAppName("WordCount")
    // Set whether Spark runs locally or on a cluster
    conf.setMaster("local")

    val sc = new SparkContext(conf)

    // Word count
    val fileRDD = sc.textFile("data/word")
    // The input data is: hello word

    // Shorthand form: val words = fileRDD.flatMap(_.split(" "))
    val words = fileRDD.flatMap((x: String) => {
      x.split(" ")
    })

    // Data after flatMap:
    // hello
    // word
    // Then turn each word into a key-value tuple: (hello, 1), (word, 1)
    // Shorthand form: val pariWord = words.map(new Tuple2(_, 1))
    val pariWord = words.map((x: String) => {
      new Tuple2(x, 1)
    })

    // Then reduce to counts per word, e.g. (hello, 2), (word, 1), (Angzush, 2)
    val res = pariWord.reduceByKey((x: Int, y: Int) => {
      x + y
    })

    // To find how many words occur a given number of times
    // (e.g. how many words, like hello, occur twice),
    // map the result in res so the count becomes the key
    val fanzhuan = res.map((x) => {
      (x._2, 1)
    })

    // Then reduceByKey again
    val value = fanzhuan.reduceByKey(_ + _)
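    // With the sample data above, value ends up as (2, 2) and (1, 1):
    // two distinct words occur twice and one word occurs once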

    value.foreach(println)
    res.foreach(println)

    // Sleep so the Spark Web UI at localhost:4040 stays reachable
    Thread.sleep(Long.MaxValue)
  }

}
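For comparison, here is a minimal sketch of the same job written with the underscore shorthand that the comments above mention. It assumes the same data/word input file; the object name WordCountShort, the variable names, and the final sc.stop() call are illustrative additions rather than part of the original code.

package spark

import org.apache.spark.{SparkConf, SparkContext}

object WordCountShort {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("WordCountShort").setMaster("local")
    val sc = new SparkContext(conf)

    // word -> count, e.g. (hello, 2), (word, 1), (Angzush, 2)
    val wordCounts = sc.textFile("data/word")
      .flatMap(_.split(" "))
      .map((_, 1))
      .reduceByKey(_ + _)

    // count -> number of words with that count, e.g. (2, 2), (1, 1)
    val byFrequency = wordCounts.map(x => (x._2, 1)).reduceByKey(_ + _)

    wordCounts.foreach(println)
    byFrequency.foreach(println)

    sc.stop()
  }
}

Chaining the operators this way builds the same RDD lineage as the step-by-step version above, so both programs produce the same results.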

Diagram: (figure omitted)
