Spark 实例 5:找出词频最高的前 K 个词
输入数据:
Hello World Bye World
Hello Hadoop Bye Hadoop
Bye Hadoop Hello Hadoop
输出结果:
(Hadoop,4)
(Bye,3)
(Hello,3)
(World,2)
/**
 * Word-frequency Top-K example.
 *
 * Step one counts how often each word occurs in a text file; step two reports
 * the K most frequent words, highest count first.
 *
 * Usage: `TopNBasic [inputPath] [k]`
 *  - `inputPath`: file to read; defaults to the sample file path below.
 *  - `k`: how many words to print; defaults to 4.
 */
object TopNBasic {

  /** Input file used when no path is supplied on the command line. */
  private val DefaultInputPath = "E:\\newcode\\MyFirstProject\\data\\topn.txt"

  /** Number of words printed when no K is supplied on the command line. */
  private val DefaultTopK = 4

  /**
   * Runs the word count and prints the top-K words to standard output.
   *
   * @param args optional `[inputPath, k]`; missing values fall back to the defaults
   * @throws IllegalArgumentException if `k` is given but is not a non-negative integer
   */
  def main(args: Array[String]): Unit = {
    val inputPath = args.headOption.getOrElse(DefaultInputPath)
    val topK = args.lift(1).fold(DefaultTopK)(parseTopK)

    val conf = new SparkConf().setAppName(TopNBasic.getClass.getSimpleName)
    // Only default to local mode when no master was configured externally
    // (e.g. via spark-submit --master), instead of always overriding it.
    conf.setIfMissing("spark.master", "local")
    val sc = new SparkContext(conf)

    try {
      val wordCounts = sc.textFile(inputPath)
        // Split on any whitespace run and drop empty tokens so blank lines or
        // repeated spaces are not counted as an empty "word".
        .flatMap(_.split("\\s+"))
        .filter(_.nonEmpty)
        .map(word => (word, 1))
        .reduceByKey(_ + _)

      // Smallest-first on (-count, word): highest counts come first and ties
      // are broken alphabetically, so the output order is deterministic.
      val byFrequency =
        Ordering.by[(String, Int), (Int, String)] { case (word, count) => (-count, word) }

      // takeOrdered keeps only K candidates per partition rather than sorting
      // the whole RDD just to read its first K elements.
      val topWords = wordCounts.takeOrdered(topK)(byFrequency)

      println("wordCounts: ")
      topWords.foreach(println)
    } finally {
      // Always release the context, even when the job fails.
      sc.stop()
    }
  }

  /** Parses the K argument, rejecting values that are not non-negative integers. */
  private def parseTopK(raw: String): Int =
    scala.util.Try(raw.trim.toInt).toOption
      .filter(_ >= 0)
      .getOrElse(throw new IllegalArgumentException(s"K must be a non-negative integer, got: $raw"))
}
