MR Web日志预处理

发布 : 2016-03-12 分类 : 大数据 浏览 :

需求:

1
2
3
对web访问日志中的各字段识别切分
去除日志中不合法的记录
根据KPI统计需求,生成各类访问请求过滤数据

Markdown

bean

1
定义一个bean,用来记录日志数据中的各数据字段
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
package com.matrix.weblog;

public class WebLogBean {

private String remote_addr;// 记录客户端的ip地址
private String remote_user;// 记录客户端用户名称,忽略属性"-"
private String time_local;// 记录访问时间与时区
private String request;// 记录请求的url与http协议
private String status;// 记录请求状态;成功是200
private String body_bytes_sent;// 记录发送给客户端文件主体内容大小
private String http_referer;// 用来记录从那个页面链接访问过来的
private String http_user_agent;// 记录客户浏览器的相关信息

private boolean valid = true;// 判断数据是否合法

public String getRemote_addr() {
return remote_addr;
}

public void setRemote_addr(String remote_addr) {
this.remote_addr = remote_addr;
}

public String getRemote_user() {
return remote_user;
}

public void setRemote_user(String remote_user) {
this.remote_user = remote_user;
}

public String getTime_local() {
return time_local;
}

public void setTime_local(String time_local) {
this.time_local = time_local;
}

public String getRequest() {
return request;
}

public void setRequest(String request) {
this.request = request;
}

public String getStatus() {
return status;
}

public void setStatus(String status) {
this.status = status;
}

public String getBody_bytes_sent() {
return body_bytes_sent;
}

public void setBody_bytes_sent(String body_bytes_sent) {
this.body_bytes_sent = body_bytes_sent;
}

public String getHttp_referer() {
return http_referer;
}

public void setHttp_referer(String http_referer) {
this.http_referer = http_referer;
}

public String getHttp_user_agent() {
return http_user_agent;
}

public void setHttp_user_agent(String http_user_agent) {
this.http_user_agent = http_user_agent;
}

public boolean isValid() {
return valid;
}

public void setValid(boolean valid) {
this.valid = valid;
}

@Override
public String toString() {
StringBuilder sb = new StringBuilder();
sb.append(this.valid);
sb.append("\001").append(this.remote_addr);
sb.append("\001").append(this.remote_user);
sb.append("\001").append(this.time_local);
sb.append("\001").append(this.request);
sb.append("\001").append(this.status);
sb.append("\001").append(this.body_bytes_sent);
sb.append("\001").append(this.http_referer);
sb.append("\001").append(this.http_user_agent);
return sb.toString();
}
}

Parser

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
package com.matrix.weblog;

public class WebLogParser {

public static WebLogBean parser(String line) {
WebLogBean webLogBean = new WebLogBean();
String[] arr = line.split(" ");
if (arr.length > 11) {
webLogBean.setRemote_addr(arr[0]);
webLogBean.setRemote_user(arr[1]);
webLogBean.setTime_local(arr[3].substring(1));
webLogBean.setRequest(arr[6]);
webLogBean.setStatus(arr[8]);
webLogBean.setBody_bytes_sent(arr[9]);
webLogBean.setHttp_referer(arr[10]);

if (arr.length > 12) {
webLogBean.setHttp_user_agent(arr[11] + " " + arr[12]);
} else {
webLogBean.setHttp_user_agent(arr[11]);
}
if (Integer.parseInt(webLogBean.getStatus()) >= 400) {// 大于400,HTTP错误
webLogBean.setValid(false);
}
} else {
webLogBean.setValid(false);
}
return webLogBean;
}

public static String parserTime(String time) {

time.replace("/", "-");
return time;

}
}

Main

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
package com.matrix.weblog;

import java.io.IOException;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WeblogPreProcess {

static class WeblogPreProcessMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
Text k = new Text();
NullWritable v = NullWritable.get();

@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

String line = value.toString();
WebLogBean webLogBean = WebLogParser.parser(line);
if (!webLogBean.isValid())
return;
k.set(webLogBean.toString());
context.write(k, v);

}

}

public static void main(String[] args) throws Exception {

Configuration conf = new Configuration();
conf.set("fs.defaultFS", "hdfs://192.168.230.10:8020");
conf.set("yarn.resourcemanager.hostname", "192.168.230.13");

FileSystem fs = FileSystem.get(//
new URI("hdfs://192.168.230.10:8020"), conf, "root");

Path outdir = new Path("/usr/matrix/output/weblog");
if (fs.exists(outdir)) {
fs.delete(outdir, true);
}

Job job = Job.getInstance(conf);
// 设置用户名称
job.setJobName("weblog");

// 指定我这个job所在jar包
job.setJarByClass(WeblogPreProcess.class);

job.setMapperClass(WeblogPreProcessMapper.class);

job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class);

// 指定输入路径
FileInputFormat.addInputPath(job, new Path("/usr/matrix/input/weblog/"));
// 指定输出路径
FileOutputFormat.setOutputPath(job, outdir);

// 向yarn集群提交这个job
boolean res = job.waitForCompletion(true);
if (res) {
System.out.println("WordCount程序运行成功!");
}

}
}

运行结果

Markdown

Markdown

本文作者 : Matrix
原文链接 : https://matrixsparse.github.io/2016/03/12/MRWeb日志预处理/
版权声明 : 本博客所有文章除特别声明外,均采用 CC BY-NC-SA 4.0 许可协议。转载请注明出处!

知识 & 情怀 | 二者兼得

微信扫一扫, 向我投食

微信扫一扫, 向我投食

支付宝扫一扫, 向我投食

支付宝扫一扫, 向我投食

留下足迹