east – 第84页 – gitweixin

运维 12月 17,2018

hadoop实例4:分析网站日志

public class LogCleanJob extends Configured implements Tool {

public static void main(String[] args) {
Configuration conf = new Configuration();
try {
int res = ToolRunner.run(conf, new LogCleanJob(), args);
System.exit(res);
} catch (Exception e) {
e.printStackTrace();
}
}

@Override
public int run(String[] args) throws Exception {
Configuration conf = new Configuration();
//设置mapper的配置，既就是hadoop/conf/mapred-site.xml的配置信息
conf.set(“mapred.job.tracker”, “hadoop01:9001”);
final Job job = new Job(new Configuration(),
LogCleanJob.class.getSimpleName());
// 设置为可以打包运行
job.setJarByClass(LogCleanJob.class);
// FileInputFormat.setInputPaths(job, args[0]);
FileInputFormat.addInputPath(job, new Path(“hdfs://localhost:9000/user/logclean”));
job.setMapperClass(MyMapper.class);
job.setMapOutputKeyClass(LongWritable.class);
job.setMapOutputValueClass(Text.class);
job.setReducerClass(MyReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class);
/*
FileOutputFormat.setOutputPath(job, new Path(args[1]));
// 清理已存在的输出文件
FileSystem fs = FileSystem.get(new URI(args[0]), getConf());
Path outPath = new Path(args[1]);
if (fs.exists(outPath)) {
fs.delete(outPath, true);
}*/

Path outputPath = new Path(“hdfs://localhost:9000/user/logcleanOutput”);
FileSystem fs = outputPath.getFileSystem(conf);
if(fs.exists(outputPath)){
fs.delete(outputPath, true);
}

FileOutputFormat.setOutputPath(job, outputPath);

boolean success = job.waitForCompletion(true);
if (success) {
System.out.println(“Clean process success!”);
} else {
System.out.println(“Clean process failed!”);
}
return 0;
}

// 静态内部类
static class MyMapper extends Mapper {
LogParser logParser = new LogParser();
Text outputValue = new Text();

protected void map(LongWritable key, Text value,
org.apache.hadoop.mapreduce.Mapper.Context context)
throws java.io.IOException, InterruptedException {
final String[] parsed = logParser.parse(value.toString());

// step1.过滤掉静态资源访问请求
if (parsed[2].startsWith(“GET /static/”)
|| parsed[2].startsWith(“GET /uc_server”)) {
return;
}
// step2.过滤掉开头的指定字符串
if (parsed[2].startsWith(“GET /”)) {
parsed[2] = parsed[2].substring(“GET /”.length());
} else if (parsed[2].startsWith(“POST /”)) {
parsed[2] = parsed[2].substring(“POST /”.length());
}
// step3.过滤掉结尾的特定字符串
if (parsed[2].endsWith(” HTTP/1.1″)) {
parsed[2] = parsed[2].substring(0, parsed[2].length()
– ” HTTP/1.1″.length());
}
// step4.只写入前三个记录类型项
outputValue.set(parsed[0] + “\t” + parsed[1] + “\t” + parsed[2]);
context.write(key, outputValue);
}
}

// 静态内部类
static class MyReducer extends Reducer {
protected void reduce(LongWritable k2, java.lang.Iterable<Text> v2s,
org.apache.hadoop.mapreduce.Reducer.Context context)
throws java.io.IOException, InterruptedException {
for (Text v2 : v2s) {
context.write(v2, NullWritable.get());
}
};
}

/*
* 日志解析类静态内部类
*/
static class LogParser {
public static final SimpleDateFormat FORMAT = new SimpleDateFormat(
“d/MMM/yyyy:HH:mm:ss”, Locale.ENGLISH);
public static final SimpleDateFormat dateformat1 = new SimpleDateFormat(
“yyyyMMddHHmmss”);

public static void main(String[] args) throws ParseException {
final String S1 = “27.19.74.143 – – [30/May/2013:17:38:20 +0800] \”GET /static/image/common/faq.gif HTTP/1.1\” 200 1127″;
LogParser parser = new LogParser();
final String[] array = parser.parse(S1);
System.out.println(“样例数据： ” + S1);
System.out.format(
“解析结果： ip=%s, time=%s, url=%s, status=%s, traffic=%s”,
array[0], array[1], array[2], array[3], array[4]);
}

/**
* 解析英文时间字符串
*
* @param string
* @return
* @throws ParseException
*/
private Date parseDateFormat(String string) {
Date parse = null;
try {
parse = FORMAT.parse(string);
} catch (ParseException e) {
e.printStackTrace();
}
return parse;
}

/**
* 解析日志的行记录
*
* @param line
* @return 数组含有5个元素，分别是ip、时间、url、状态、流量
*/
public String[] parse(String line) {
String ip = parseIP(line);
String time = parseTime(line);
String url = parseURL(line);
String status = parseStatus(line);
String traffic = parseTraffic(line);
return new String[] { ip, time, url, status, traffic };
}

private String parseTraffic(String line) {
final String trim = line.substring(line.lastIndexOf(“\””) + 1)
.trim();
String traffic = trim.split(” “)[1];
return traffic;
}

private String parseStatus(String line) {
final String trim = line.substring(line.lastIndexOf(“\””) + 1)
.trim();
String status = trim.split(” “)[0];
return status;
}

private String parseURL(String line) {
final int first = line.indexOf(“\””);
final int last = line.lastIndexOf(“\””);
String url = line.substring(first + 1, last);
return url;
}

private String parseTime(String line) {
final int first = line.indexOf(“[“);
final int last = line.indexOf(“+0800]”);
String time = line.substring(first + 1, last).trim();
Date date = parseDateFormat(time);
return dateformat1.format(date);
}

private String parseIP(String line) {
String ip = line.split(“- -“)[0].trim();
return ip;
}
}
}

作者 east

Hadoop 12月 14,2018

hadoop实例3:合并相同数据

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.util.GenericOptionsParser;
/**
* 需要把相同订单id的记录放在一个文件中，并以订单id命名
* @author Administrator
*
*/
public class MultipleOutputTest {

static class SortMapper extends
Mapper<LongWritable, Text, Text, Text> {

@Override
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
// TODO Auto-generated method stub
try {
System.out.println(“Before Mapper: ” + value + “, ” + value);
String line = value.toString();
String[] fields = line.split(“,”);
System.out.println(“fields[0]: ” + fields[0] + “, fields[2]=”
+ Double.parseDouble(fields[2]));
context.write(new Text(fields[0]), value);
System.out.println(“After Mapper: “);
} catch (Exception ex) {
ex.printStackTrace();
}
}
}

static class SortReducer extends Reducer<Text, Text, NullWritable, Text> {
private MultipleOutputs<NullWritable, Text> multipleOutputs;

@Override
protected void setup(org.apache.hadoop.mapreduce.Reducer.Context context)
throws IOException, InterruptedException {
// TODO Auto-generated method stub
multipleOutputs = new MultipleOutputs<NullWritable, Text>(context);
}

@Override
protected void reduce(Text key, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
System.out.print(“Before Reduce: ” + key + “, ” + key);
for (Text value : values) {
multipleOutputs.write(NullWritable.get(), value, key.toString());
}
}

@Override
protected void cleanup(
org.apache.hadoop.mapreduce.Reducer.Context context)
throws IOException, InterruptedException {
// TODO Auto-generated method stub
multipleOutputs.close();
}

// 主函数
public static void main(String[] args) throws Exception {
// 获取配置参数
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args)
.getRemainingArgs();
// 检查命令语法
if (otherArgs.length != 2) {
System.err.println(“Usage: Dedup <in> <out>”);
System.exit(2);
}
// 定义作业对象
Job job = new Job(conf, “multiple”);
// 注册分布式类
job.setJarByClass(MultipleOutputTest.class);
// 注册输出格式类
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
// 注册Mapper类
job.setMapperClass(SortMapper.class);
// 注册Reducer类
job.setReducerClass(SortReducer.class);
// 设置输入输出路径
FileInputFormat.setInputPaths(job, new Path(otherArgs[0]));
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
// 运行程序
System.exit(job.waitForCompletion(true) ? 0 : 1);
}

}

作者 east

Hadoop 12月 14,2018

hadoop实例2:分析学生成绩(最高、最低、平均分)

输入数据:

computer,huangxiaoming,85
computer,xuzheng,54
computer,huangbo,86
computer,liutao,85
computer,huanglei,99
computer,liujialing,85
computer,liuyifei,75
computer,huangdatou,48
computer,huangjiaju,88
computer,huangzitao,85
english,zhaobenshan,57
english,liuyifei,85
english,liuyifei,76
english,huangdatou,48
english,zhouqi,85
english,huangbo,85
english,huangxiaoming,96
english,huanglei,85
english,liujialing,75
algorithm,liuyifei,75
algorithm,huanglei,76
algorithm,huangjiaju,85
algorithm,liutao,85
algorithm,huangdou,42
algorithm,huangzitao,81
math,wangbaoqiang,85
math,huanglei,76
math,huangjiaju,85
math,liutao,48
math,xuzheng,54
math,huangxiaoming,85
math,liujialing,85

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;
public class StudentBean implements Writable{
private String course;
private long maxScore;
private long minScore;
private long avgScore;
private long score;

public StudentBean(){

}

public StudentBean(String course, long score){
this.course = course;
this.score = score;
}

public String getCourse() {
return course;
}

public void setCourse(String course) {
this.course = course;
}

public long getMaxScore() {
return maxScore;
}

public void setMaxScore(long maxScore) {
this.maxScore = maxScore;
}

public long getMinScore() {
return minScore;
}

public void setMinScore(long minScore) {
this.minScore = minScore;
}

public long getAvgScore() {
return avgScore;
}

public void setAvgScore(long avgScore) {
this.avgScore = avgScore;
}

public long getScore() {
return score;
}

public void setScore(long score) {
this.score = score;
}

@Override
public void readFields(DataInput in) throws IOException {
// TODO Auto-generated method stub
try{

// course = in.readUTF();
/*maxScore = in.readLong();
minScore = in.readLong();
avgScore = in.readLong();*/
score = in.readLong();
}catch(Exception ex){
ex.printStackTrace();
}

}

@Override
public void write(DataOutput out) throws IOException {
// TODO Auto-generated method stub
//out.writeBytes(course);
// out.writeLong(maxScore);
// out.writeLong(minScore);
out.writeLong(score);
}

@Override
public String toString() {
// TODO Auto-generated method stub
return “\tmax=” + maxScore + “\tmin=” + minScore + “\tavg=” + avgScore;
}
}

import java.io.IOException;
/**
* 需要统计手机用户流量日志，日志内容实例：
要把同一个用户的上行流量、下行流量进行累加，并计算出综合。
*/
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Mapper.Context;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.util.GenericOptionsParser;
import test.Dedup;
import test.Dedup.Map;
import test.Dedup.Reduce;

public class StudentScore {

static class FlowCountMapper extends
Mapper<LongWritable, Text, Text, StudentBean> {

@Override
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
// TODO Auto-generated method stub
try {
String line = value.toString();
if (null != line && line.length() > 0) {
String[] fields = line.split(“,”);
String course = fields[0];
long score = Long.parseLong(fields[2]);
System.out.println(“map: course=” + course + ” score=”
+ score);
context.write(new Text(fields[0]), new StudentBean(course,
score));
}
} catch (Exception ex) {
ex.printStackTrace();
}
}
}

static class FlowCountReducer extends
Reducer<Text, StudentBean, Text, StudentBean> {

@Override
protected void reduce(
Text key,
Iterable<StudentBean> values,
org.apache.hadoop.mapreduce.Reducer<Text, StudentBean, Text, StudentBean>.Context context)
throws IOException, InterruptedException {
try {
// TODO Auto-generated method stub
long sum_score = 0;
long maxScore = Integer.MIN_VALUE;
long minScore = Integer.MAX_VALUE;
int index = 0;
for (StudentBean bean : values) {
System.out.println(“reduce: score=” + bean.getScore());
sum_score += bean.getScore();
index++;
if (maxScore < bean.getScore()) {
maxScore = bean.getScore();
}
if (minScore > bean.getScore()) {
minScore = bean.getScore();
}
}
System.out.println(“reduce: maxScore=” + maxScore
+ ” minScore=” + minScore);
StudentBean resultBean = new StudentBean();
resultBean.setAvgScore(sum_score / index);
resultBean.setMaxScore(maxScore);
resultBean.setMinScore(minScore);
context.write(key, resultBean);
} catch (Exception ex) {
ex.printStackTrace();
}
}

// 主函数
public static void main(String[] args) throws Exception {
// 获取配置参数
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args)
.getRemainingArgs();
// 检查命令语法
if (otherArgs.length != 2) {
System.err.println(“Usage: Dedup <in> <out>”);
System.exit(2);
}
// 定义作业对象
Job job = new Job(conf, “FlowCount”);
// 注册分布式类
job.setJarByClass(StudentScore.class);
// 注册Mapper类
job.setMapperClass(FlowCountMapper.class);
// 注册Reducer类
job.setReducerClass(FlowCountReducer.class);
// 指定我们自定义的分区器
// job.setPartitionerClass(ProvincePartitioner.class);
// 同时指定相应分区数量的reducetask
// job.setNumReduceTasks(5);
// 注册输出格式类
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(StudentBean.class);
// 设置输入输出路径
FileInputFormat.setInputPaths(job, new Path(otherArgs[0]));
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
// 运行程序
System.exit(job.waitForCompletion(true) ? 0 : 1);
}

}

作者 east

Hadoop 12月 14,2018

hadoop实例1:采集的气象数据分析最高气温

输入数据:

2014010114
2014010216
2014010317
2014010410
2014010506
2012010609
2012010732
2012010812
2012010919
2012011023
2001010116
2001010212
2001010310
2001010411
2001010529
2013010619
2013010722
2013010812
2013010929
2013011023
2008010105
2008010216
2008010337
2008010414
2008010516
2007010619
2007010712
2007010812
2007010999
2007011023
2010010114
2010010216
2010010317
2010010410
2010010506
2015010649
2015010722
2015010812
2015010999
2015011023

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Mapper.Context;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.util.GenericOptionsParser;

public class Temperature {
/**
* 四个泛型类型分别代表：
* KeyIn Mapper的输入数据的Key，这里是每行文字的起始位置（0,11,…）
* ValueIn Mapper的输入数据的Value，这里是每行文字
* KeyOut Mapper的输出数据的Key，这里是每行文字中的“年份”
* ValueOut Mapper的输出数据的Value，这里是每行文字中的“气温”
*/
static class TempMapper extends Mapper<LongWritable, Text, Text, IntWritable>{
@Override
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
// TODO Auto-generated method stub
// 打印样本: Before Mapper: 0, 2000010115
System.out.println(“Before Mapper: ” + key + “, ” + value);
if(null == value || value.getLength() == 0){
return;
}
String line = value.toString();
String year = line.substring(0, 4);
int temperature = Integer.parseInt(line.substring(8));
context.write(new Text(year), new IntWritable(temperature));
// 打印样本: After Mapper:2000, 15
System.out.println(
“======”+
“After Mapper:” + new Text(year) + “, ” + new IntWritable(temperature));

}
}

/**
* 四个泛型类型分别代表：
* KeyIn Reducer的输入数据的Key，这里是每行文字中的“年份”
* ValueIn Reducer的输入数据的Value，这里是每行文字中的“气温”
* KeyOut Reducer的输出数据的Key，这里是不重复的“年份”
* ValueOut Reducer的输出数据的Value，这里是这一年中的“最高气温”
*/
static class TempReducer extends
Reducer<Text, IntWritable, Text, IntWritable> {
@Override
public void reduce(Text key, Iterable<IntWritable> values,
Context context) throws IOException, InterruptedException {
int maxValue = Integer.MIN_VALUE;
StringBuffer sb = new StringBuffer();
for(IntWritable value: values){
maxValue = Math.max(maxValue, value.get());
sb.append(value).append(“, “);
}
// 打印样本： Before Reduce: 2000, 15, 23, 99, 12, 22,
System.out.print(“Before Reduce: ” + key + “, ” + sb.toString());
context.write(key,new IntWritable(maxValue));
// 打印样本： After Reduce: 2000, 99
System.out.println(
“======”+
“After Reduce: ” + key + “, ” + maxValue);
}
}

public static void main(String[] args) throws Exception {
Configuration hadoopConfig = new Configuration();
String[] otherArgs = new GenericOptionsParser(hadoopConfig, args).getRemainingArgs();
// 检查命令语法
if(otherArgs.length != 2){
System.err.println(“Usage: Dedup <in> <out>”);
System.exit(2);
}
//输入路径
// String dst = “hdfs://localhost:9000/user/tminput/”;
//输出路径，必须是不存在的，空文件加也不行。
// String dstOut = “hdfs://localhost:9000/user/tmoutput/”;

/* hadoopConfig.set(“fs.hdfs.impl”,
org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()
);
hadoopConfig.set(“fs.file.impl”,
org.apache.hadoop.fs.LocalFileSystem.class.getName()
);*/
Job job = new Job(hadoopConfig, “Temperature”);
//如果需要打成jar运行，需要下面这句
job.setJarByClass(Temperature.class);

FileInputFormat.setInputPaths(job,new Path(otherArgs[0]));
FileOutputFormat.setOutputPath(job,new Path(otherArgs[1]));

//指定自定义的Mapper和Reducer作为两个阶段的任务处理类
job.setMapperClass(TempMapper.class);
job.setReducerClass(TempReducer.class);
//job执行作业时输入和输出文件的路径

//设置最后输出结果的Key和Value的类型
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);

//执行job，直到完成
job.waitForCompletion(true);
System.out.println(“Finished”);
}

}

作者 east

Spark 12月 14,2018

spark实例8:读取mysql数据

/**
 * Created by Administrator on 2017/11/6.
 */
public class SparkMysql {
    public static org.apache.log4j.Logger logger = org.apache.log4j.Logger.getLogger(SparkMysql.class);

    public static void main(String[] args) {
        JavaSparkContext sparkContext = new JavaSparkContext(new SparkConf().setAppName("SparkMysql").setMaster("local[5]"));
        SQLContext sqlContext = new SQLContext(sparkContext);
        //读取mysql数据
        readMySQL(sqlContext);

        //停止SparkContext
        sparkContext.stop();
    }
    private static void readMySQL(SQLContext sqlContext){
        //jdbc.url=jdbc:mysql://localhost:3306/database
        String url = "jdbc:mysql://localhost:3306/test";
        //查找的表名
        String table = "hb_links";
        //增加数据库的用户名(user)密码(password),指定test数据库的驱动(driver)
        Properties connectionProperties = new Properties();
        connectionProperties.put("user","root");
        connectionProperties.put("password","168168");
        connectionProperties.put("driver","com.mysql.jdbc.Driver");

        //SparkJdbc读取Postgresql的products表内容
        System.out.println("读取rangfei数据库中的hb_links表内容");
        // 读取表中所有数据
      //  DataFrame jdbcDF = sqlContext.read().jdbc(url,table,connectionProperties).select("*");
        Dataset<Row> jdbcDF = sqlContext.read().jdbc(url,table,connectionProperties).select("*");

        //显示数据
        jdbcDF.show();
    }
}

作者 east

Spark 12月 14,2018

spark实例7:使用Spark SQL来分析网络日志

case class ApacheAccessLog(
                            ipAddress: String, // IP地址
                              clientId: String, // 客户端唯一标识符
                              userId: String, // 用户唯一标识符
                              serverTime: String, // 服务器时间
                              method: String, // 请求类型/方式
                              endpoint: String, // 请求的资源
                              protocol: String, // 请求的协议名称
                              responseCode: Int, // 请求返回值：比如：200、401
                              contentSize: Long // 返回的结果数据大小
                          ) {

}

object ApacheAccessLog {
  // regex
  // 64.242.88.10 - - [   07/Mar/2004:16:05:49 -0800       ]
  // "GET /twiki/bin/edit/Main/Double_bounce_sender?topicparent=Main.ConfigurationVariables HTTP/1.1"
  // 401 12846
  val PARTTERN =
  """^(\S+) (\S+) (\S+) \[([\w:/]+\s[+|-]\d{4})\] "(\S+) (\S+) (\S+)" (\d{3}) (\d+)""".r

  def isValidatelogLine(log: String): Boolean = {
    val res = PARTTERN.findFirstMatchIn(log)
    if (res.isEmpty) {
      false
    } else {
      true
    }

  }

  def parseLogLine(log: String): ApacheAccessLog = {
    val res = PARTTERN.findFirstMatchIn(log)
    if (res.isEmpty) {
      throw new RuntimeException("Cannot parse log line: " + log)
    }
    val m = res.get

    ApacheAccessLog(
      m.group(1),
      m.group(2),
      m.group(3),
      m.group(4),
      m.group(5),
      m.group(6),
      m.group(7),
      m.group(8).toInt,
      m.group(9).toLong
    )
  }
}

import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.{SparkContext, SparkConf}

/**
  * Created by Administrator on 2017/4/25.
  */
object LogAnalysis {
  def main(args: Array[String]): Unit = {
    //sqlContext
    val conf = new SparkConf()
      .setMaster("local[*]")
      .setAppName("log-analysis-sparksql")
    val sc = SparkContext.getOrCreate(conf)
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._ //如果不写，下面的转换不成功

    //transform
    val path = "E:\\newcode\\MyFirstProject\\data\\test.log"
    val rdd = sc.textFile(path)
    val apacheAccessDataFrame = rdd
      .filter(line => ApacheAccessLog.isValidatelogLine(line))
      .map(line => {
        ApacheAccessLog.parseLogLine(line)
      }).cache().toDF() //rdd转换为DataFrame

    //register temptable
    apacheAccessDataFrame.registerTempTable("log_analysis_temp_table")
      sqlContext.sql("select * from log_analysis_temp_table limit 1").show()

      //需求一：求contentSize的平均值，最大值以及最小值
      val resultDataFrame1 = sqlContext.sql(

      """
        |SELECT
        |AVG(contentSize) as avg_contentSize,
        |MAX(contentSize) as max_contentSize,
        |MIN(contentSize) as min_contentSize
        |FROM log_analysis_temp_table
      """.stripMargin
      )
      resultDataFrame1.show()

      //save                                         //save as HDFS
      val resultRdd = resultDataFrame1.map(row => {
        val avgSize = row.getAs[Double]("avg_contentSize")
        val minSize = row.getAs[Long]("min_contentSize")
        val maxSize = row.getAs[Long]("max_contentSize")
        (avgSize, minSize, maxSize)
      })
      resultRdd.rdd.saveAsTextFile(s"E:/newcode/MyFirstProject/data/output/sql_${System.currentTimeMillis()}")


    //需求二：求各个返回值出现的数据个数
    val resultDataFrame2 = sqlContext.sql(
      """
        |SELECT
        |responseCode AS code,
        |COUNT(1) AS count
        |FROM log_analysis_temp_table
        |GROUP BY responseCode
      """.stripMargin
    )
    resultDataFrame2.show()
    resultDataFrame2.repartition(1).write.format("com.databricks.spark.csv").save("E:\\newcode\\MyFirstProject\\data\\output\\responseCode")


    //需求三：求访问次数大于N的IP地址，并对黑名单进行限制
    val blackIP = Array("200-55-104-193.ds1.prima.net.ar", "10.0.0.153", "208-38-57-205.ip.cal.radiant.net")
    val N = 10
    val resultDataFrame3 = sqlContext.sql(
      s"""
         |SELECT
         |ipAddress AS ip,
         |COUNT(1) AS count
         |FROM log_analysis_temp_table
         |WHERE not(ipAddress in(${blackIP.map(ip => s"'${ip}'").mkString(",")}))
         |GROUP BY ipAddress
         |HAVING count>${N}
     """.stripMargin)
    resultDataFrame3.show()


    //需求四：求访问次数最多的前k个endpoint的值
    val k = 50
    val resultDataFrame4 = sqlContext.sql(
      s"""
         |SELECT
         |  t.endpoint,
         |  t.count
         |FROM(
         |SELECT
         |  endpoint,
         |  COUNT(1) AS count
         |FROM log_analysis_temp_table
         |GROUP BY endpoint) t
         |ORDER BY t.count DESC
         |limit ${k}
      """.stripMargin)
    resultDataFrame4.show()
    resultDataFrame4.repartition(1).write.format("com.databricks.spark.csv").save("E:\\newcode\\MyFirstProject\\data\\output\\maxendpoint")

  }
}

作者 east

Spark 12月 14,2018

spark实例6:kafka生产者和消费者实例

mvn配置:

<dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
      <version>2.4.0</version>
  </dependency>

<dependency>
    <groupId>com.alibaba</groupId>
    <artifactId>fastjson</artifactId>
    <version>1.2.54</version>
</dependency>

<dependency>
    <groupId>redis.clients</groupId>
    <artifactId>jedis</artifactId>
    <version>2.9.0</version>
</dependency>



生产者代码:
import java.util.Properties
import com.alibaba.fastjson.JSONObject
import kafka.producer.{KeyedMessage, Producer, ProducerConfig}

import scala.util.Random

object OrderProducer {
def main(args: Array[String]): Unit = {

//Kafka参数设置
val topic = "order"
val brokers = "192.168.0.219:9092"
val props = new Properties()
props.put("metadata.broker.list", brokers)
props.put("serializer.class", "kafka.serializer.StringEncoder")
val kafkaConfig = new ProducerConfig(props)
//创建生产者
val producer = new Producer[String, String](kafkaConfig)

while (true) {
//随机生成10以内ID
val id = Random.nextInt(10)
//创建订单成交事件
val event = new JSONObject();
//商品ID
event.put("id", id)
//商品成交价格
event.put("price", Random.nextInt(10000))

//发送信息
producer.send(new KeyedMessage[String, String](topic, event.toString))
println("Message sent: " + event)
//随机暂停一段时间
Thread.sleep(Random.nextInt(100))
}
}
}


消费者代码：

import com.alibaba.fastjson.JSON
import kafka.click.RedisClient
import kafka.serializer.StringDecoder
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.kafka010.KafkaUtils
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka010._
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe

/**
  * Created by Administrator on 2017/8/30.
  */
object OrderConsumer {
  //Redis配置
  val dbIndex = 0
  //每件商品总销售额
  val orderTotalKey = "app::order::total"
  //每件商品每分钟销售额
  val oneMinTotalKey = "app::order::product"
  //总销售额
  val totalKey = "app::order::all"


  def main(args: Array[String]): Unit = {

    // 创建 StreamingContext 时间片为1秒
    val conf = new SparkConf().setMaster("local").setAppName("UserClickCountStat")
    val ssc = new StreamingContext(conf, Seconds(1))

    // Kafka 配置
    val topics = Set("order")
    val brokers = "192.168.0.219:9092"
    /*val kafkaParams = Map[String, String](
      "metadata.broker.list" -> brokers,
      "serializer.class" -> "kafka.serializer.StringEncoder")
    */

    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "192.168.0.219:9092,192.168.0.220:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "use_a_separate_group_id_for_each_stream",
      "auto.offset.reset" -> "latest",
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )

   // val topics = Array("topicA", "topicB")
    val kafkaStream = KafkaUtils.createDirectStream[String, String](
      ssc,
      PreferConsistent,
      Subscribe[String, String](topics, kafkaParams)
    )

    // 创建一个 direct stream
 //   val kafkaStream = KafkaUtils.createDirectStream[String, String](ssc, kafkaParams, topics)

    //解析JSON
  //  val events = kafkaStream.flatMap(line => Some(JSON.parseObject(line._2)))
  val events = kafkaStream.flatMap(line => Some(JSON.parseObject(line.value())))

    // 按ID分组统计个数与价格总合
    val orders = events.map(x => (x.getString("id"), x.getLong("price"))).groupByKey().map(x => (x._1, x._2.size, x._2.reduceLeft(_ + _)))

    //输出
    orders.foreachRDD(x =>
      x.foreachPartition(partition =>
        partition.foreach(x => {

          println("id=" + x._1 + " count=" + x._2 + " price=" + x._3)

          //保存到Redis中
          val jedis = RedisClient.pool.getResource
          jedis.auth("123456")
          jedis.select(dbIndex)
          //每个商品销售额累加
          jedis.hincrBy(orderTotalKey, x._1, x._3)
          //上一分钟第每个商品销售额
          jedis.hset(oneMinTotalKey, x._1.toString, x._3.toString)
          //总销售额累加
          jedis.incrBy(totalKey, x._3)
          RedisClient.pool.returnResource(jedis)


        })
      ))


    ssc.start()
    ssc.awaitTermination()
  }

}

作者 east

Spark 12月 14,2018

spark实例5:找出词频最高的前K个词

输入数据：

Hello World Bye World
Hello Hadoop Bye Hadoop
Bye Hadoop Hello Hadoop

输出结果：

(Hadoop,4)
(Bye,3)
(Hello,3)
(World,2)

/**
  * 一是统计词频，二是找出词频最高的前K个词
  */
object TopNBasic {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName(TopNBasic.getClass.getSimpleName)
    conf.setMaster("local")
    val sc = new SparkContext(conf)
    val textFile = sc.textFile("E:\\newcode\\MyFirstProject\\data\\topn.txt")
    val words = textFile.flatMap(line => line.split(" "))
    val wordPairs = words.map(word => (word, 1))
    val wordCounts = wordPairs.reduceByKey((a,b) => a + b)
 // val groupRDD = textFile.map(line => (line.split(" ")(0),line.split(" ")(1).toInt)).groupByKey()
  //  val top2 = groupRDD.map(pair => (pair._1, pair._2.toList.sortWith(_>_).take(2)))
   val top2 = wordCounts.sortBy(_._2, false).take(4)

    println("wordCounts: ")
    top2.foreach(println)
  }
}

作者 east

Spark 12月 14,2018

spark实例4:求中位数

输入数据:

1 2 3 4 5 6 8 9 11 12 34

输出结果：

/**
  * 求中位数
  */

object Median {

  def main (args: Array[String]) {

    val conf = new SparkConf().setAppName("Median").setMaster("local")
    val sc = new SparkContext(conf)

    val data = sc.textFile("E:\\newcode\\MyFirstProject\\data\\Median.txt")

    val words = data.flatMap(_.split(" ")).map(word => word.toInt)
    words.collect().foreach(println)
    val number = words.map(word =>(word/4,word)).sortByKey()
    number.collect().foreach(println)
    val pariCount = words.map(word => (word/4,1)).reduceByKey(_+_).sortByKey()
    pariCount.collect().foreach(println)
    val count = words.count().toInt
    var mid =0
    if(count%2 != 0)
    {
      mid = count/2+1
    }else
    {
      mid = count/2
    }

    var temp =0
    var temp1= 0
    var index = 0
    val tongNumber = pariCount.count().toInt

    var foundIt = false
    for(i <- 0 to tongNumber-1 if !foundIt)
    {
      println(pariCount.collectAsMap()(i).toString);
      temp = temp + pariCount.collectAsMap()(i)
      temp1 = temp - pariCount.collectAsMap()(i)
      if(temp >= mid)
      {
        index = i
        foundIt = true
      }
    }
    val tonginneroffset = mid - temp1

    val median = number.filter(_._1==index).takeOrdered(tonginneroffset)
    sc.setLogLevel("ERROR")
    println(median(tonginneroffset-1)._2)
    sc.stop()

  }
}

作者 east

Spark 12月 14,2018

spark实例3:倒排索引

   输入数据文件格式：
   cx1:a,b,c,d,e,f
   cx2:c,d,e,f
   cx3:a,b,c,f
   cx4:a,b,c,d,e,f
   cx5:a,b,e,f
   cx6:a,b,c,d
   cx7:a,b,c,f
   cx8:d,e,f
   cx9:b,c,d,e,f
  
  输出结果:
  a|cx1,cx3,cx4,cx5,cx6,cx7
  b|cx1,cx3,cx4,cx5,cx6,cx7,cx9
  c|cx1,cx2,cx3,cx4,cx6,cx7,cx9
  d|cx1,cx2,cx4,cx6,cx8,cx9
  e|cx1,cx2,cx4,cx5,cx8,cx9
  f|cx1,cx2,cx3,cx4,cx5,cx7,cx8,cx9

import scala.io.Source
object InvertedIndex {

  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName(InvertedIndex.getClass.getSimpleName)
    conf.setMaster("local")
    val sc = new SparkContext(conf)
    /* 倒排索引InvertedIndex */

    val source = Source.fromFile("D:\\java\\spark\\data\\invertIndex.txt").getLines.toArray

    val cxRDD0 = sc.parallelize(source) /* spark单机读取数据 */

   val rdd2 = cxRDD0.flatMap {

      lines =>

        val line = lines.split(":", -1) /* 拆分数据，以竖杠为拆分条件 */

        line(1).split(",", -1).map {
          /* 再对拆分后的数据，进行第二次拆分 */

          v =>

            (v, line(0)) /* 拼接数据 */

        }

    }
    rdd2.collect().foreach {println}
     rdd2.groupByKey() /* 分组 */

      .sortBy(_._1, true) /* 排序 */

      .foreach(x => println(s"${x._1}|${x._2.mkString(",")}")) /* 格式化输出 */

  }
}

作者 east

Hbase 12月 14,2018

利用JavaAPI来操作Hbase

例子采用的是完全分布式集群，不是hbase自带的zookeeper，是独立的zookeeper

mvn的依赖如下：

<dependency>
    <groupId>org.apache.hbase</groupId>
    <artifactId>hbase-client</artifactId>
    <version>1.4.8</version>
</dependency>

<dependency>
    <groupId>org.apache.hbase</groupId>
    <artifactId>hbase-server</artifactId>
    <version>1.4.8</version>
</dependency>

在hbase中创建表、插入数据、查询数据等操作

import java.text.SimpleDateFormat
import java.util

import hbase.TestHbaeJavaApi.conf
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client._
import org.apache.hadoop.hbase.filter.CompareFilter.CompareOp
import org.apache.hadoop.hbase.filter.{FilterList, SingleColumnValueFilter}
import org.apache.hadoop.hbase.util.Bytes

/**
  * 利用JavaAPI来操作Hbase
  */
object HBaseTool {
  val zkQuorum = "192.168.0.219"
  val port = "2181"
  val table = "test"
  val cf = "cf1"
  val config = HBaseConfiguration.create()
  config.set("hbase.zookeeper.property.clientPort", "2181")
  config.set("hbase.zookeeper.quorum", "192.168.0.219")
  config.set("hbase.master", "192.168.0.219:600000")

  def putData(rowKey:String, cf:String = cf, kv:Seq[(String,String)]): Put ={
    val put = new Put(Bytes.toBytes(rowKey))
    kv.foreach{ kv =>
      put.addColumn(Bytes.toBytes(cf), Bytes.toBytes(kv._1), Bytes.toBytes(kv._2))
    }
    put
  }

  def getData(rowKey:String, qualifier:String =null): Get={
    val get = new Get(Bytes.toBytes(rowKey))
    if(qualifier != null)
      get.addColumn(Bytes.toBytes(cf), Bytes.toBytes(qualifier))
    get
  }

  def getScan(startRow:String, stopRow:String, filters:Map[String,String]= null, columns:Seq[String]= null): Scan= {
    var filterList: FilterList = null
    val scan = new Scan()
      .setStartRow(Bytes.toBytes(startRow))
      .setStopRow(Bytes.toBytes(stopRow))

    if(filters != null){
      filterList = getFilters(filters)
      if(filterList.getFilters.size() > 0)
        scan.setFilter(filterList)
    }

    if(columns != null) {
      columns.foreach{ column =>
        scan.addColumn(Bytes.toBytes(cf), Bytes.toBytes(column))
      }
    }

    scan
  }

  def getFilters(kv:Map[String,String]): FilterList = {
    val filterList = new FilterList()
    kv.toSeq.foreach{ kv =>
      val filter = new SingleColumnValueFilter(
        Bytes.toBytes(cf),
        Bytes.toBytes(kv._1),
        CompareOp.EQUAL,
        Bytes.toBytes(kv._2)
      )
      filter.setFilterIfMissing(true)
      filterList.addFilter(filter)
    }
    filterList
  }

  def main(args: Array[String]): Unit = {
    val rowKey = new SimpleDateFormat("yyyy-MM-dd").format(System.currentTimeMillis())
    val testSchema = Seq("id","name","age")
    val testData = Seq("10001","Jack","22")
    /**
      * 1. 插入数据到HBase
      */
    val hTable: HTable = new HTable(config, TableName.valueOf(table))
    hTable.setAutoFlush(false)
    hTable.setWriteBufferSize(10 * 1024 * 1024)
    //处理任务功能
    hTable.put(putData(rowKey, cf=cf, testSchema zip testData))

    hTable.flushCommits()

    /**
      * 2. 通过Get查询HBase
      */
    val result: Result = hTable.get(getData(rowKey))
    for(kv <- result.raw()) {
      println("key="+Bytes.toString(kv.getQualifier)+", value="+Bytes.toString(kv.getValue))
    }

    val result1: Result = hTable.get(getData(rowKey, testSchema.toList(2)))
    for(kv <- result1.raw()) {
      println("value="+Bytes.toString(kv.getValue))
    }

    /**
      * 3. 通过Scan查询HBase
      */
    val scan = getScan(rowKey, rowKey)
    val resultScan: ResultScanner = hTable.getScanner(scan)
    val ite: util.Iterator[Result] = resultScan.iterator()
    while(ite.hasNext) {
      val result = ite.next()
      for(kv <- result.raw()) {
        println("rowKey="+Bytes.toString(kv.getRow)+", key="+Bytes.toString(kv.getQualifier)+", value="+Bytes.toString(kv.getValue))
      }
    }

    hTable.close()
  }



}

作者 east

定时自动备份mysql数据库并删除过期备份