页面埋点
对一个网站进行流量分析,首先要做的就是数据采集;而采集的方式大至两种方式nginx +lua 日志文件后台http get服务,实时push 到kafka对于网站前端来说,数据上报通常有如下几种形式直接向后台发送get请求,伪装成js或者图片请求http://click.dangdang.com/page_tracker.php?m_id=&o_id=&am
对一个网站进行流量分析,首先要做的就是数据采集;而采集的方式大致有两种
- nginx +lua 日志文件
- 后台http get服务,实时push 到kafka
对于网站前端来说,数据上报通常有如下几种形式
直接向后台发送get请求,伪装成js或者图片请求
http://click.dangdang.com/page_tracker.php?m_id=&o_id=&region_ids=&out_refer=&refer_url=&url=http://www.dangdang.com/&to_url=&type=1&visit_id=20181119161826757207396759923945024&is_first_visit=0&ctr_type=&perm_id=20181114174025913676682755771693602&udid= &res=1920,1080||1903,5211&title=当当—网上购物中心:图书、母婴、美妆、家居、数码、家电、服装、鞋包等,正品低价,货到付款&trace_id=nohead&special=guan=1;page=id:1|name:当首;&cif=&rsv1=&rsv2=&rsv3=&platform=pc&r=0.857700135627224
https://a.stat.xiaomi.com/js/mstr.js?mid=&device_id=&phpsessid=&mstuid=1536571987936_2638&muuid=&mucid=&sessionId=1690051968&step=185&new_visitor=0&mstprevpid=&mstprev_pid_loc=&prevtarget=&lastsource=&timestamp=1542615493495&ref=&domain=.mi.com&screen=1920*1080&language=zh-CN&vendor=Google%20Inc.&platform=Win32&gu=&miwd=&edm_task=&masid=&client_id=&pu=&rf=0&mutid=&muwd=&domain_id=100&pageid=81190ccc4d52f577&curl=https%3A%2F%2Fwww.mi.com%2F&xmv=1536571987936_2638_1542615493495&v=1.0.0&vuuid=7ERAQ0IQQIBIFMAV
https://warriors.jd.com/log.gif?t=exp_log.100000&m=UA-J2011-1&pin=-&uid=1368883904&sid=1368883904|19&v={"t1":"pc_homepage","t2":"basic","p0":"{\"rept\":\"impr\",\"poi\":\"head|focus|08\",\"text\":\"11.19个护感恩节\",\"url\":\"//sale.jd.com/act/1dCqk7TBj5porf8.html\",\"desc\":\"个护电器\",\"mcinfo\":\"00755652-05703860-1100950352-M#0-2-1--58--#1-tb-#300-9908298#pc-home\",\"biclk\":\"1#6328b7df38f1cf2c1fd7c296f1e920cd7b603c53-101-619081#9908298\"}","pinid":"-","je":0,"sc":"24-bit","sr":"1920x1080","ul":"zh-cn","cs":"UTF-8","dt":"京东(JD.COM)-正品低价、品质保障、配送及时、轻松购物!","hn":"www.jd.com","fl":"-","os":"win","br":"chrome","bv":"68.0.3440.106","wb":"1536298255","xb":"1542165688","yb":1542615817,"zb":19,"cb":1,"usc":"direct","ucp":"-","umd":"none","uct":"-","ct":1542615839771,"lt":0,"tad":"-","jdv":"122270672|direct|-|none|-|1542165687598","dataver":"0.1"}&ref=&rm=1542615839772
回到数据收集端
nginx + lua
这种方式需要在nginx端配置日志格式;接收到前端日志收集请求后,会对请求解析,并将日志数据记录在本地磁盘;这种方式,有几个明显的缺点:
一是日志存储在本地磁盘,而我们做大数据离线分析时,数据通常都存储在hdfs上,所以这种方式不可避免地需要将日志上传到hdfs上去;二是因为以日志文件形式存储,所以没办法做实时的统计分析
后台收集
这个就需要开发一个日志收集服务端,提供一个http get服务;这个服务将上报的数据推送到kafka中;相比第一种方式,后台收集,你就不需要去各个服务器去收集日志文件;数据推送到kafka,也就意味着,我们可以使用storm,sparkstreaming进行实时分析;这个也是目前使用最广的方式
站点的数据采集流程【后台收集为例】
首先是数据上报前端;用过友盟统计和百度统计的同学都知道,想要使用友盟百度站点统计功能,首先要做的就是,在站点嵌入一段js或者html代码,大概像这个样子
<script type="text/javascript">
    // Site-specific tracking settings that xmst.js copies into every report.
    // A plain object is used because the settings are looked up by name; an
    // Array here would also expose any enumerable Array.prototype additions
    // to the for...in loop that reads these settings.
    var _maq = {};
    _maq['_setAccount'] = 'uuid';
    _maq['ppppp'] = 'ppppp';
    // Inject the tracking script asynchronously, ahead of the first <script>
    // element on the page, so that loading it does not block rendering.
    (function () {
        var ma = document.createElement('script');
        ma.type = 'text/javascript';
        ma.async = true;
        ma.src = "http://localhost:8089/xmst.js";
        var s = document.getElementsByTagName('script')[0];
        s.parentNode.insertBefore(ma, s);
    })();
</script>
这段代码的意思,就是动态加载远程的js[http://localhost:8089/xmst.js],嵌入到需要统计服务的站点;xmst.js代码如下
// Tracking payload shared by every report sent from this script.
var params = {};
// Page data from the Document object. The typeof guard keeps the script from
// throwing a ReferenceError when the global is not declared at all.
if (typeof document !== 'undefined' && document) {
    params.domain = document.domain || ''; // host name of the page
    params.url = document.URL || ''; // URL of the current page
    params.title = document.title || '';
    params.referrer = document.referrer || ''; // URL of the previous page
}
// Display data from the Window object.
if (typeof window !== 'undefined' && window && window.screen) {
    params.sh = window.screen.height || 0; // screen height
    params.sw = window.screen.width || 0; // screen width
    params.cd = window.screen.colorDepth || 0; // colour depth
}
// Browser data from the Navigator object.
if (typeof navigator !== 'undefined' && navigator) {
    params.lang = navigator.language || ''; // preferred language
}
params['age'] = '111';
// Merge the settings the embedding page placed in _maq. A page that never
// declared _maq is tolerated instead of aborting the whole script.
if (typeof _maq !== 'undefined' && _maq) {
    for (var i in _maq) {
        // Copy own entries only, so properties inherited through the
        // prototype chain are not reported as tracking parameters.
        if (Object.prototype.hasOwnProperty.call(_maq, i)) {
            params[i] = _maq[i];
        }
    }
}
/**
 * Serialises the shared `params` object into a `key=value&key=value` string.
 *
 * Only own properties of `params` are included. Keys and values are joined
 * as-is, without escaping, so a value that itself contains '&' or '=' cannot
 * be told apart from a separator in the returned string.
 *
 * @returns {string} the joined pairs, or '' when `params` has no own entries
 */
function args_build() {
    var pairs = [];
    for (var key in params) {
        // Skip enumerable properties inherited through the prototype chain;
        // they are not tracking data and must not reach the collector.
        if (Object.prototype.hasOwnProperty.call(params, key)) {
            pairs.push(key + '=' + params[key]);
        }
    }
    return pairs.join('&');
}
/**
 * Reports a page view.
 *
 * Creates a 1x1 Image and points it at the collector's log.gif endpoint,
 * passing the output of args_build(), URI-encoded, as the `args` query
 * parameter. Assigning `src` is what triggers the GET request.
 */
function page_load() {
    var beacon = new Image(1, 1);
    var payload = encodeURIComponent(args_build());
    beacon.src = 'http://localhost:8089/flow/log.gif?args=' + payload;
}
/**
 * Reports a click event.
 *
 * Copies every enumerable entry of `maps` into the shared `params` object and
 * then sends the full payload to the collector through a 1x1 Image request.
 * The copied entries remain in `params`, so they are included in any report
 * sent afterwards as well.
 *
 * @param {Object} maps extra key/value pairs describing the click
 */
function a_click(maps) {
    var beacon = new Image(1, 1);
    for (var name in maps) {
        params[name] = maps[name];
    }
    var payload = encodeURIComponent(args_build());
    beacon.src = 'http://localhost:8089/flow/log.gif?args=' + payload;
}
// Send the page-view report as soon as this script is evaluated.
page_load();
加载了这个脚本的页面会自动调用page_load()方法,这个方法会将前端的数据伪装成一个长宽都为1像素的img GET请求,请求明文如下
页面浏览报文
http://localhost:8089/flow/log.gif?args=domain=localhost&url=http://localhost:8090/#&title=page test&referrer=&sh=1080&sw=1920&cd=24&lang=zh-CN&age=111&_setAccount=uuid&ppppp=ppppp
页面点击报文
http://localhost:8089/flow/log.gif?args=domain=localhost&url=http://localhost:8090/#&title=page test&referrer=&sh=1080&sw=1920&cd=24&lang=zh-CN&age=111&_setAccount=uuid&ppppp=ppppp&pageid=index.html&pcpid=pcpid
浏览和点击报文,区别在于pageid=index.html&pcpid=pcpid,pcpid定义为页面位置【例如点击了某个链接;触发了a_click(maps)方法】;
使用这种方式主要是为了解决跨域的问题,因为大多数情况下,统计脚本不单单为一个站点服务,域名也不可能全都一样;
服务端接口
http://localhost:8089/flow/log.gif?args=params
收集端代码如下【省略push kafka过程】
package com.fan.ga.gaserver.controller;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Controller;
import org.springframework.web.bind.annotation.RequestMapping;
import javax.imageio.ImageIO;
import javax.servlet.http.HttpServletResponse;
import java.awt.image.BufferedImage;
import java.io.IOException;
import java.io.OutputStream;
/**
 * Collector endpoint for tracking beacons.
 *
 * Handles requests such as {@code http://localhost:8089/flow/log.gif?args=asfafd}:
 * the {@code args} value is written to the log and a 1x1 GIF is returned.
 */
@Controller
@RequestMapping("/flow")
public class LogCollector {

    Logger logger = LoggerFactory.getLogger(LogCollector.class);

    /**
     * Logs the reported data and answers with a 1x1 GIF image.
     *
     * @param args     value of the {@code args} request parameter, logged as received
     * @param response servlet response that receives the headers and the image bytes
     * @throws IOException if the image cannot be written to the response
     */
    @RequestMapping(value = "log.gif")
    public void analysis(String args, HttpServletResponse response) throws IOException {
        logger.info(args);
        disableCaching(response);
        writePixel(response);
    }

    /** Sets the response headers that tell clients not to cache the image. */
    private void disableCaching(HttpServletResponse response) {
        response.setHeader("Pragma", "No-cache");
        response.setHeader("Cache-Control", "no-cache");
        response.setDateHeader("Expires", 0);
    }

    /** Writes a 1x1 GIF to the response body and flushes it. */
    private void writePixel(HttpServletResponse response) throws IOException {
        response.setContentType("image/gif");
        OutputStream body = response.getOutputStream();
        BufferedImage pixel = new BufferedImage(1, 1, BufferedImage.TYPE_INT_RGB);
        ImageIO.write(pixel, "gif", body);
        body.flush();
    }
}
站点index.html页面
<html>
<head>
    <meta charset="utf-8"/>
    <title>page test</title>
    <script type="text/javascript">
        // Site-specific tracking settings that xmst.js copies into every
        // report. A plain object is used because the settings are looked up
        // by name; an Array here would also expose any enumerable
        // Array.prototype additions to the for...in loop that reads them.
        var _maq = {};
        _maq['_setAccount'] = 'uuid';
        _maq['ppppp'] = 'ppppp';
        // Inject the tracking script asynchronously, ahead of the first
        // <script> element on the page.
        (function () {
            var ma = document.createElement('script');
            ma.type = 'text/javascript';
            ma.async = true;
            ma.src = "http://localhost:8089/xmst.js";
            var s = document.getElementsByTagName('script')[0];
            s.parentNode.insertBefore(ma, s);
        })();
    </script>
</head>
<body>
首页
<!-- Clicking the link reports a click event with the page id and position. -->
<a href="#" onclick="a_click({ 'pageid': 'index.html', 'pcpid': 'pcpid' })">detail</a>
</body>
</html>
End
更多推荐
所有评论(0)