• 2017/6/26补充:接手了搜索系统,这半年有了很多新的心得,但懒得改这篇粗鄙之文了,大家可以结合这篇新博文来理解下面的粗鄙代码:http://blog.csdn.net/yujishi2/article/details/73849237
  • 背景:网上关于spark streaming的文章还是比较多的,可是大多数用scala实现,因我们的电商实时推荐项目以java为主,就踩了些坑,写了java版的实现,代码比较意识流,轻喷,欢迎讨论。
  • 流程:spark streaming从kafka读用户实时点击数据,过滤数据后从redis读商品相似度矩阵,从db读user历史行为,实时计算兴趣度,并将结果写入redis一份,供api层读取展示,写入hdfs一份供离线计算准确率召回率。
  • 补充:据了解,大型实时推荐系统里面,协同过滤一般用作生成候选集,计算兴趣度会被ctr等策略的rerank代替,即在calculateinterest中调用在线rerank服务排序。
  • 12/13补充:召回不变,目前采用ctr预估加上规则排序,后续上ltr。

  • 废话少说,上代码:

/**
 * Spark Streaming entry point: consumes messages from Kafka, turns each
 * "userId:goodsId" record into a recommendation list through
 * CalculateInterestService, and saves the (userId, list) pairs as Hadoop
 * sequence files.
 *
 * NOTE(review): this listing appears to have lost most closing braces and a few
 * statements (several blocks are opened but never closed); confirm against the
 * original source before building.
 */
public class Main {
    // ZooKeeper connection string handed to KafkaUtils.createStream (hosts are masked here).
    static final String ZK_QUORUM = "*.*.*.*:2181,*.*.*.*:2181,*.*.*.*:2181/kafka";
    // Group id argument handed to KafkaUtils.createStream.
    static final String GROUP = "test-consumer-group";
    // Comma-separated topic names; split on "," in main().
    static final String TOPICSS = "user_trace";
    // Parsed to an int and stored as the value for every topic in the topic map.
    static final String NUM_THREAD = "64";

    /**
     * Builds the streaming pipeline: Kafka stream -> message values -> decoded
     * records -> per-user recommendation text -> Hadoop sequence files.
     *
     * @param args command-line arguments; not read anywhere in the visible code
     */
    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf().setAppName("main.java.computingCenter");
        // Create the context with 2 seconds batch size
        JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, new Duration(2000));

        // Map every configured topic to the same numeric value (NUM_THREAD).
        int numThreads = Integer.parseInt(NUM_THREAD);
        Map<String, Integer> topicMap = new HashMap<String, Integer>();
        String[] topics = TOPICSS.split(",");
        for (String topic: topics) {
            topicMap.put(topic, numThreads);

        JavaPairReceiverInputDStream<String, String> messages =
                KafkaUtils.createStream(jssc, ZK_QUORUM, GROUP, topicMap);

        // Keep only the second element of each tuple (presumably the message payload).
        JavaDStream<String> lines = messages.map(new Function<Tuple2<String, String>, String>() {
            public String call(Tuple2<String, String> tuple2) {
                return tuple2._2();

        // Split each line on spaces, parse every piece as JSON, URL-decode its
        // "Data" field and take the segment after the first "=".
        JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
            public Iterable<String> call(String lines) {
                List<String> arr = new ArrayList<String>();
                for (String s : lines.split(" ")) {
                    Map j = JSON.parseObject(s);
                    String s1 = "";
                    String s2 = "";
                    try {
                        s1 = URLDecoder.decode(j.get("Data").toString(), "UTF-8");
                        s2 = s1.split("=")[1];
                    // NOTE(review): the catch body is not visible, and nothing visible
                    // ever adds s2 to arr — a line such as arr.add(s2) seems to be missing.
                    } catch (UnsupportedEncodingException e) {
                return arr;

        // Keep only records that split on ":" into exactly two parts
        // (used below as userId and goodsId).
        JavaPairDStream<String, String> goodsSimilarityLists = words.filter(new Function<String, Boolean>() {
            public Boolean call(String s) throws Exception {
                if (s.split(":").length == 2) {
                    return true;
                return false;
        }).mapPartitionsToPair(new PairFlatMapFunction<Iterator<String>, String, String>() {
            // For every record in the partition, compute the recommendation map and
            // emit (userId, "<currentTimeMillis>:<goods ids separated by {/c}>").
            public Iterable<Tuple2<String, String>> call(Iterator<String> s) throws Exception {
                ArrayList<Tuple2<String, String>> result = new ArrayList<Tuple2<String, String>>();
                while (s.hasNext()) {
                    String x = s.next();
                    String userId = x.split(":")[0];
                    String goodsId = x.split(":")[1];
                    LinkedHashMap<Long, Double> recommendMap = null;
                    try {
                        // A new service instance is created for every record.
                        CalculateInterestService calculateInterestService = new CalculateInterestService();
                        try {
                            recommendMap = calculateInterestService.calculateInterest(userId, goodsId);
                        // NOTE(review): catch body not visible; if recommendMap stays null the
                        // loop below would throw and fall into the outer catch.
                        } catch (Exception e) {

                        // Concatenate the recommended goods ids, separated by "{/c}".
                        String text = "";
                        int count = 0;
                        for (Map.Entry<Long, Double> entry : recommendMap.entrySet()) {
                            text = text + entry.getKey();
                            // NOTE(review): the body of this check is not visible; presumably it
                            // exits the loop after the last entry to avoid a trailing separator.
                            if (count == recommendMap.size() - 1) {
                            count = count + 1;
                            text = text + "{/c}";

                        // Prefix the list with the current time in milliseconds.
                        text = System.currentTimeMillis() + ":" + text;
                        result.add(new Tuple2<String, String>(userId, text));
                    // NOTE(review): catch body not visible — confirm failures are not silently dropped.
                    } catch (Exception e) {

                return result;

        // NOTE(review): the only visible statement in this callback is "return null";
        // any per-RDD work that was here is not part of this listing.
        goodsSimilarityLists.foreachRDD(new Function<JavaPairRDD<String, String>, Void>() {
            public Void call(JavaPairRDD<String, String> rdd) throws Exception {
                return null;

        // Wrap both key and value in Hadoop Text so the pairs can be written as sequence files.
        JavaPairDStream<Text, Text> goodsSimilarityListsText = goodsSimilarityLists.mapToPair(new PairFunction<Tuple2<String, String>, Text, Text>(){
            public Tuple2<Text, Text> call(Tuple2<String, String> ori) throws Exception {
                return new Tuple2(new Text(ori._1), new Text(ori._2));

        // Save each batch with path prefix "/user/hadoop/recommend_list/rl" and suffix "123".
        goodsSimilarityListsText.saveAsHadoopFiles("/user/hadoop/recommend_list/rl", "123", Text.class, Text.class, SequenceFileOutputFormat.class);


/**
 * Computes a ranked recommendation list for a user from item-similarity data
 * read from Redis and the user's history read over JDBC, and writes the
 * resulting list back to Redis.
 */
public class CalculateInterestService {

    // Redis hash name read with hget/hmget to look up item-similarity lists.
    private String dictKey = "greate_item_sim_2.0";
    // Redis hash name written with hset; the field is the user id.
    private String recommendTable = "great_recommend_table_2.0";
    // JDBC connection settings used by getUserTrace (values are masked in this listing).
    static final String HIGO_BASE_URL = "jdbc:mysql://*.*.*.*:3212/*";
    static final String HIGO_BASE_USER = "*";
    static final String HIGO_BASE_PASS = "*";

    /**
     * Scores the goods similar to {@code traceGoodsId} for the given user and
     * returns them ordered by score, highest first.
     *
     * Visible steps:
     * 1. Read the similarity list of {@code traceGoodsId} from Redis; entries are
     *    separated by "," and the part before ":" is a goods id.
     * 2. Load the user's history through getUserTrace.
     * 3. Fetch the similarity list of every candidate in one hmget call.
     * 4. For each candidate, compute sum(score * sim) / sum(sim) over its similar
     *    goods, where score depends on the user's recorded action for that item.
     * 5. Sort descending, copy into a LinkedHashMap, write the list to Redis.
     *
     * NOTE(review): this listing is missing closing braces and at least one
     * branch body; confirm against the original source.
     *
     * @param userId       user whose history is loaded and whose list is written
     * @param traceGoodsId goods id whose similarity list seeds the candidates
     * @return candidate goods id -> predicted score, in descending score order;
     *         an empty map when getUserTrace throws ClassNotFoundException
     */
    public LinkedHashMap<Long, Double> calculateInterest(String userId, String traceGoodsId) {
        LinkedHashMap<Long, Double> sortedMap = new LinkedHashMap<Long, Double>();
        // NOTE(review): if hget yields null for an unknown goods id, the split call
        // throws a NullPointerException — verify RedisHelper's contract.
        String[] simGoods = RedisHelper.getInstance().hget(dictKey, traceGoodsId).split(",");
        HashMap<Long, String> userTrace = null;
        try {
            userTrace = getUserTrace(userId);
        } catch (ClassNotFoundException e) {
            // sortedMap is still empty at this point.
            return sortedMap;
        HashMap<Long, Double> recommendMap = new HashMap<Long, Double>();
        // Extract the goods id (text before ":") of every candidate.
        String[] simGoodsIds = new String[simGoods.length];
        for (int i = 0; i < simGoods.length; i++) {
            simGoodsIds[i] = simGoods[i].split(":")[0];
        // One hmget for all candidates; results are matched to ids by position.
        List<String> pSimGoodsIds = RedisHelper.getInstance().hmget(dictKey, simGoodsIds);
        HashMap<Long, String> predictSimGoodsIds = new HashMap<Long, String>();
        for (int i = 0; i < simGoodsIds.length; i++) {
            predictSimGoodsIds.put(Long.parseLong(simGoodsIds[i]), pSimGoodsIds.get(i));
        for (String item : simGoods) {
            // needs optimisation

            Double totalSum = 0.0;
            Double sum = 0.0;
            Long originGoodsId = Long.parseLong(item.split(":")[0]);
            // NOTE(review): a null entry from hmget would make this split throw.
            for (String predictGoods : predictSimGoodsIds.get(originGoodsId).split(",")) {
                Long goodsId = Long.parseLong(predictGoods.split(":")[0].toString());
                Double sim = Double.valueOf(predictGoods.split(":")[1].toString());
                totalSum = totalSum + sim;
                Double score = 0.0;
                if (!userTrace.containsKey(goodsId)) {
                    // TODO: the user rating matrix is too sparse and should be filled in with
                    // SVD; for now items without a rating get the default score 0.1.
                    // Note that this inserts into the map returned by getUserTrace.
                    userTrace.put(goodsId, "default");
                String action = userTrace.get(goodsId);

                // Map the recorded action to a score.
                if (action.equals("click")) {
                    score = 0.2;
                // NOTE(review): no assignment is visible in this branch, so score stays 0.0
                // as shown — a line may be missing here.
                } else if (action.equals("favorate")) {

                } else if (action.equals("add_cart")) {
                    score = 0.6;
                } else if (action.equals("order")) {
                    score = 0.8;

                } else if (action.equals("default")) {

                    score = 0.1;
                // The similarity dictionary should store entries in goodsid:sim format; needs refactoring.
                sum = sum + score * sim;

            // Similarity-weighted average of the action scores.
            // NOTE(review): yields NaN when every sim is 0 — confirm that cannot happen.
            Double predictResult = sum / totalSum;
            recommendMap.put(originGoodsId, predictResult);

        // Sort the recommendation list by score, highest first.
        List<Map.Entry<Long, Double>> list = new ArrayList<Map.Entry<Long, Double>>(recommendMap.entrySet());
        Collections.sort(list, new Comparator<Map.Entry<Long, Double>>() {
            public int compare(Map.Entry<Long, Double> o1, Map.Entry<Long, Double> o2) {
                return o2.getValue().compareTo(o1.getValue());

        // Copy into the LinkedHashMap so iteration order follows the sorted order.
        Map.Entry<Long, Double> tmpEntry = null;
        Iterator<Map.Entry<Long, Double>> iter = list.iterator();
        while (iter.hasNext()) {
            tmpEntry = iter.next();
            sortedMap.put(tmpEntry.getKey(), tmpEntry.getValue());

        writeRecommendListToRedis(userId, sortedMap);

        return sortedMap;


    /**
     * Loads the user's goods history and returns it as goods id -> action.
     *
     * Queries t_pandora_goods_record by account_id, then splits
     * userTrace.getGoodsIds() on "," and parses the part before ":" of each entry
     * as the goods id. Every entry is currently recorded with the action "click".
     *
     * NOTE(review): the body of the result-set loop and of the catch block are not
     * visible in this listing, so how the query result reaches userTrace cannot be
     * confirmed from here.
     *
     * @param userId account id used in the query
     * @return map from goods id to action name
     * @throws ClassNotFoundException declared by the signature; nothing visible in
     *         this listing throws it
     */
    private HashMap<Long, String> getUserTrace(String userId) throws ClassNotFoundException {
        //SQLContext sqlContext = new org.apache.spark.sql.SQLContext(sc);
        PreparedStatement stmt = null;
        Connection conn = null;
        UserTrace userTrace = new UserTrace();
        try {
            conn = DriverManager.getConnection(HIGO_BASE_URL, HIGO_BASE_USER, HIGO_BASE_PASS);
            // NOTE(review): userId is concatenated straight into the SQL text; a "?"
            // placeholder with a bound parameter would avoid injection.
            String sql = "select * from t_pandora_goods_record where account_id=" + userId;
            stmt = (PreparedStatement)conn.prepareStatement(sql);
            ResultSet rs = stmt.executeQuery();
            // NOTE(review): loop body not visible; conn, stmt and rs are never closed
            // in the visible code.
            while(rs.next()) {
        } catch (Exception e) {

        String[] goodsActionTimestamp = userTrace.getGoodsIds().split(",");
        HashMap<Long, String> hm = new HashMap<Long, String>();
        for (String ac : goodsActionTimestamp) {
            Long goodsId = Long.parseLong(ac.split(":")[0]);
            //String action = ac.split(":")[1];
            //String timestamp = ac.split(":")[2];
            // Hack: the next step is to have BI write the user's history into the table in
            // action:goodsId:timestamp format; the timestamp will later feed into the weighting.
            String action = "click";
            hm.put(goodsId, action);
        return hm;

    private void writeRecommendListToRedis(String userId, LinkedHashMap<Long, Double> sortedMap) {
        String recommendList = "";
        int count = 0;
        for (Map.Entry<Long, Double> entry : sortedMap.entrySet()) {
            recommendList = recommendList + entry.getKey();
            if (count == sortedMap.size() - 1) {
            count = count + 1;
            recommendList = recommendList + ",";
        RedisHelper.getInstance().hset(recommendTable, userId, recommendList);


