Spark实例5:找出词频最高的前K个词
输入数据:
Hello World Bye World
Hello Hadoop Bye Hadoop
Bye Hadoop Hello Hadoop
输出结果(K = 4):
(Hadoop,4)
(Bye,3)
(Hello,3)
(World,2)
/**
 * Word-frequency Top-K example: first counts how often each word occurs in a
 * text file, then prints the K most frequent words.
 *
 * Usage: `TopNBasic [inputPath] [k]`
 *   - `inputPath` defaults to the sample file used by the original example.
 *   - `k` defaults to 4 and must be a positive integer.
 */
object TopNBasic {
  import org.apache.spark.{SparkConf, SparkContext}

  /** Input file read when no path argument is supplied. */
  private val DefaultInputPath = "E:\\newcode\\MyFirstProject\\data\\topn.txt"

  /** Number of words reported when no K argument is supplied. */
  private val DefaultTopK = 4

  /**
   * Runs the job on a local Spark master and prints the top-K `(word, count)` pairs.
   *
   * @param args optional `[inputPath, k]`; missing entries fall back to the defaults
   * @throws IllegalArgumentException if `k` is supplied but is not a positive integer
   */
  def main(args: Array[String]): Unit = {
    val inputPath = args.headOption.getOrElse(DefaultInputPath)
    val k = args.lift(1) match {
      case Some(raw) =>
        scala.util.Try(raw.trim.toInt).toOption.filter(_ > 0).getOrElse(
          throw new IllegalArgumentException(s"K must be a positive integer, got: $raw")
        )
      case None => DefaultTopK
    }

    val conf = new SparkConf().setAppName(TopNBasic.getClass.getSimpleName)
    conf.setMaster("local")
    val sc = new SparkContext(conf)

    try {
      val textFile = sc.textFile(inputPath)

      // Split on any run of whitespace and drop empty tokens so that repeated
      // spaces, tabs, or leading blanks do not get counted as a "word".
      val words = textFile.flatMap(line => line.split("\\s+")).filter(_.nonEmpty)
      val wordPairs = words.map(word => (word, 1))
      val wordCounts = wordPairs.reduceByKey(_ + _)

      // Alternative kept from the original (per-key top 2 of numeric values):
      // val groupRDD = textFile.map(line => (line.split(" ")(0),line.split(" ")(1).toInt)).groupByKey()
      // val top2 = groupRDD.map(pair => (pair._1, pair._2.toList.sortWith(_>_).take(2)))

      // Highest count first; ties are broken alphabetically by word so the
      // printed order is deterministic across runs.
      val topK = wordCounts
        .sortBy { case (word, count) => (-count, word) }
        .take(k)

      println("wordCounts: ")
      topK.foreach(println)
    } finally {
      // Always release the context, even if the job fails.
      sc.stop()
    }
  }
}