The first step in analyzing a website's traffic is collecting the data, and there are roughly two ways to do that:

  • nginx + lua, writing to local log files
  • a backend HTTP GET service that pushes the data to Kafka in real time

On the front end, the report usually takes the form of a GET request sent directly to the backend, disguised as a JS or image request. Some real-world examples:
http://click.dangdang.com/page_tracker.php?m_id=&o_id=&region_ids=&out_refer=&refer_url=&url=http://www.dangdang.com/&to_url=&type=1&visit_id=20181119161826757207396759923945024&is_first_visit=0&ctr_type=&perm_id=20181114174025913676682755771693602&udid= &res=1920,1080||1903,5211&title=当当—网上购物中心:图书、母婴、美妆、家居、数码、家电、服装、鞋包等,正品低价,货到付款&trace_id=nohead&special=guan=1;page=id:1|name:当首;&cif=&rsv1=&rsv2=&rsv3=&platform=pc&r=0.857700135627224

https://a.stat.xiaomi.com/js/mstr.js?mid=&device_id=&phpsessid=&mstuid=1536571987936_2638&muuid=&mucid=&sessionId=1690051968&step=185&new_visitor=0&mstprevpid=&mstprev_pid_loc=&prevtarget=&lastsource=&timestamp=1542615493495&ref=&domain=.mi.com&screen=1920*1080&language=zh-CN&vendor=Google%20Inc.&platform=Win32&gu=&miwd=&edm_task=&masid=&client_id=&pu=&rf=0&mutid=&muwd=&domain_id=100&pageid=81190ccc4d52f577&curl=https%3A%2F%2Fwww.mi.com%2F&xmv=1536571987936_2638_1542615493495&v=1.0.0&vuuid=7ERAQ0IQQIBIFMAV

https://warriors.jd.com/log.gif?t=exp_log.100000&m=UA-J2011-1&pin=-&uid=1368883904&sid=1368883904|19&v={"t1":"pc_homepage","t2":"basic","p0":"{\"rept\":\"impr\",\"poi\":\"head|focus|08\",\"text\":\"11.19个护感恩节\",\"url\":\"//sale.jd.com/act/1dCqk7TBj5porf8.html\",\"desc\":\"个护电器\",\"mcinfo\":\"00755652-05703860-1100950352-M#0-2-1--58--#1-tb-#300-9908298#pc-home\",\"biclk\":\"1#6328b7df38f1cf2c1fd7c296f1e920cd7b603c53-101-619081#9908298\"}","pinid":"-","je":0,"sc":"24-bit","sr":"1920x1080","ul":"zh-cn","cs":"UTF-8","dt":"京东(JD.COM)-正品低价、品质保障、配送及时、轻松购物!","hn":"www.jd.com","fl":"-","os":"win","br":"chrome","bv":"68.0.3440.106","wb":"1536298255","xb":"1542165688","yb":1542615817,"zb":19,"cb":1,"usc":"direct","ucp":"-","umd":"none","uct":"-","ct":1542615839771,"lt":0,"tad":"-","jdv":"122270672|direct|-|none|-|1542165687598","dataver":"0.1"}&ref=&rm=1542615839772

Back to the collection side.

nginx + lua

With this approach you configure a log format on the nginx side; when a tracking request arrives from the front end, nginx parses it and writes the log data to local disk. This has a couple of obvious drawbacks:

  • The logs sit on local disk, while offline big-data analysis normally runs against data stored on HDFS, so the log files inevitably have to be uploaded to HDFS first.
  • Because the data only exists as log files, real-time analysis is impossible.

Backend collection

This requires building a log collection server that exposes an HTTP GET endpoint and pushes the reported data into Kafka. Compared with the first approach, backend collection spares you from gathering log files from every server; and once the data is in Kafka, it can be processed in real time with Storm or Spark Streaming (see the consumer sketch below). This is currently the most widely used approach.
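
As a rough illustration of the consumption side, the sketch below uses the plain Kafka consumer API instead of Storm or Spark Streaming; a streaming job would subscribe to the same topic in much the same way. The topic name flow_log, the broker address, and the consumer group are assumptions made only for this example.

package com.fan.ga.gaserver.consumer;

import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;

import java.time.Duration;
import java.util.Collections;
import java.util.Properties;

// Minimal sketch of real-time consumption; a Storm or Spark Streaming job
// would read the same topic. Topic name, broker address and group id are
// assumptions for illustration only.
public class FlowLogConsumer {

    public static void main(String[] args) {
        Properties props = new Properties();
        props.put("bootstrap.servers", "localhost:9092");   // assumed broker address
        props.put("group.id", "flow-analysis");             // assumed consumer group
        props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        props.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");

        KafkaConsumer<String, String> consumer = new KafkaConsumer<>(props);
        consumer.subscribe(Collections.singletonList("flow_log"));   // assumed topic name

        while (true) {
            ConsumerRecords<String, String> records = consumer.poll(Duration.ofSeconds(1));
            for (ConsumerRecord<String, String> record : records) {
                // Each record value is one reported query string, e.g.
                // "domain=localhost&url=...&title=..."; a real job would parse
                // the fields and update counters here.
                System.out.println(record.value());
            }
        }
    }
}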

The site's data collection flow (using backend collection as the example)

First, the front-end reporting. Anyone who has used Umeng Analytics or Baidu Analytics knows that the first step to enabling site statistics is embedding a snippet of JS or HTML code into the site, roughly like this:

    <script type="text/javascript">
        var _maq = new Array();
        _maq['_setAccount'] = 'uuid';
        _maq['ppppp'] = 'ppppp';
        (function () {
            var ma = document.createElement('script');
            ma.type = 'text/javascript';
            ma.async = true;
            ma.src = "http://localhost:8089/xmst.js";
            var s = document.getElementsByTagName('script')[0];
            s.parentNode.insertBefore(ma, s);

        })();
    </script>

This snippet dynamically loads the remote script (http://localhost:8089/xmst.js) into any site that needs the statistics service. The xmst.js code is as follows:

var params = {};
// Document object data
if (document) {
    params.domain = document.domain || '';      // domain name
    params.url = document.URL || '';            // current URL
    params.title = document.title || '';        // page title
    params.referrer = document.referrer || '';  // referrer (previous page)
}
// Window object data
if (window && window.screen) {
    params.sh = window.screen.height || 0;      // screen height
    params.sw = window.screen.width || 0;       // screen width
    params.cd = window.screen.colorDepth || 0;  // color depth
}
// Navigator object data
if (navigator) {
    params.lang = navigator.language || '';     // browser language
}

params['age'] = '111';                          // hard-coded test field
// Merge the _maq configuration set by the embedding snippet
if (typeof _maq !== 'undefined') {
    for (var i in _maq) {                       // user-defined fields passed in at embed time
        params[i] = _maq[i];
    }
}

function args_build() {
    // Concatenate all collected fields into a query string
    var args = '';
    for (var i in params) {
        if (args != '') {
            args += '&';
        }
        args += i + '=' + params[i];
    }
    return args;
}

// Fired automatically when the page loads
function page_load() {
    // Disguise the report as a 1x1 Image request to the backend endpoint
    var img = new Image(1, 1);
    var src = 'http://localhost:8089/flow/log.gif?args=' + encodeURIComponent(args_build());
    img.src = src;
}

// Click event reporting
function a_click(maps) {
    // Merge the click-specific fields, then send the same disguised Image request
    var img = new Image(1, 1);
    for (var i in maps) {
        params[i] = maps[i];
    }
    var src = 'http://localhost:8089/flow/log.gif?args=' + encodeURIComponent(args_build());
    img.src = src;
}
page_load();

A page that loads this script automatically calls page_load(), which sends the collected front-end data as a GET request disguised as a 1x1 pixel image. The plaintext requests look like this:

Page view request
http://localhost:8089/flow/log.gif?args=domain=localhost&url=http://localhost:8090/#&title=page test&referrer=&sh=1080&sw=1920&cd=24&lang=zh-CN&age=111&_setAccount=uuid&ppppp=ppppp

Page click request
http://localhost:8089/flow/log.gif?args=domain=localhost&url=http://localhost:8090/#&title=page test&referrer=&sh=1080&sw=1920&cd=24&lang=zh-CN&age=111&_setAccount=uuid&ppppp=ppppp&pageid=index.html&pcpid=pcpid

The view and click requests differ only in pageid=index.html&pcpid=pcpid, where pcpid identifies the position on the page (for example, which link was clicked when a_click(maps) was triggered).

The image-request trick is used mainly to get around cross-origin restrictions, since in most cases the tracking script serves more than one site and the domains cannot all be the same.

Server-side endpoint
http://localhost:8089/flow/log.gif?args=params

The collector code is shown below (the push-to-Kafka step is omitted; a sketch of it follows after the class):

package com.fan.ga.gaserver.controller;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Controller;
import org.springframework.web.bind.annotation.RequestMapping;

import javax.imageio.ImageIO;
import javax.servlet.http.HttpServletResponse;
import java.awt.image.BufferedImage;
import java.io.IOException;
import java.io.OutputStream;

@Controller
@RequestMapping("/flow")
public class LogCollector {
    private static final Logger logger = LoggerFactory.getLogger(LogCollector.class);

    // http://localhost:8089/flow/log.gif?args=asfafd
    @RequestMapping(value = "log.gif")
    public void analysis(String args, HttpServletResponse response) throws IOException {
        // Log the raw reported query string (this is also where it would be pushed to Kafka)
        logger.info(args);
        // Make sure the browser never caches the tracking pixel
        response.setHeader("Pragma", "No-cache");
        response.setHeader("Cache-Control", "no-cache");
        response.setDateHeader("Expires", 0);
        response.setContentType("image/gif");
        // Respond with a 1x1 GIF so the request looks like an ordinary image
        OutputStream out = response.getOutputStream();
        BufferedImage image = new BufferedImage(1, 1, BufferedImage.TYPE_INT_RGB);
        ImageIO.write(image, "gif", out);
        out.flush();
    }
}
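
For completeness, the omitted push-to-Kafka step could look roughly like the sketch below. It is only an illustration under stated assumptions: the topic name flow_log, the broker address localhost:9092, and the FlowKafkaSender class itself are hypothetical and not part of the original code.

package com.fan.ga.gaserver.kafka;

import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerRecord;
import org.apache.kafka.common.serialization.StringSerializer;

import java.util.Properties;

// Hypothetical sender for the omitted push-to-Kafka step; topic name and
// broker address are assumptions used only for illustration.
public class FlowKafkaSender {

    private static final String TOPIC = "flow_log";           // assumed topic name

    private final KafkaProducer<String, String> producer;

    public FlowKafkaSender() {
        Properties props = new Properties();
        props.put("bootstrap.servers", "localhost:9092");     // assumed broker address
        props.put("key.serializer", StringSerializer.class.getName());
        props.put("value.serializer", StringSerializer.class.getName());
        this.producer = new KafkaProducer<>(props);
    }

    // Push one raw tracking record (the reported "args" query string) to Kafka
    public void send(String rawLog) {
        producer.send(new ProducerRecord<>(TOPIC, rawLog));
    }

    public void close() {
        producer.close();
    }
}

With a sender like this wired in (for example as a Spring bean injected into LogCollector), analysis() would call send(args) right after logging, so every tracking request becomes one Kafka message.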

The site's index.html page

<html>

<head>
    <meta charset="utf-8"/>
    <title>page test</title>
    <script type="text/javascript">
        var _maq = new Array();
        _maq['_setAccount'] = 'uuid';
        _maq['ppppp'] = 'ppppp';
        (function () {
            var ma = document.createElement('script');
            ma.type = 'text/javascript';
            ma.async = true;
            ma.src = "http://localhost:8089/xmst.js";
            var s = document.getElementsByTagName('script')[0];
            s.parentNode.insertBefore(ma, s);

        })();
    </script>

</head>

<body>
首页
<a href="#" onclick="a_click({ 'pageid': 'index.html', 'pcpid': 'pcpid' })">detail</a>
</body>
</html>

End
