Flink: Consuming from Kafka and Inserting into ClickHouse


1. Maven Dependencies

        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-clients_2.11</artifactId>
            <version>1.13.1</version>
        </dependency>

        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-java</artifactId>
            <version>1.13.1</version>
        </dependency>

        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-java_2.11</artifactId>
            <version>1.13.1</version>
        </dependency>

        <dependency>
            <groupId>org.apache.kafka</groupId>
            <artifactId>kafka-clients</artifactId>
            <version>3.1.0</version>
        </dependency>

        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-kafka_2.11</artifactId>
            <version>1.13.1</version>
        </dependency>

        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-simple</artifactId>
            <version>1.7.36</version>
        </dependency>

        <!-- JSON parsing -->
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>2.0.3</version>
        </dependency>

        <dependency>
            <groupId>ru.yandex.clickhouse</groupId>
            <artifactId>clickhouse-jdbc</artifactId>
            <version>0.3.2</version>
        </dependency>

2. Job Class

 public static void main(String[] args) {
        try {
            /** 1. Kafka source properties */
            Properties properties = new Properties();

            properties.setProperty(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, BROKERS);
            properties.setProperty(ConsumerConfig.GROUP_ID_CONFIG, GROUP_ID);
            // Interval for automatically detecting topic partition changes.
            // Note: the new KafkaSource reads "partition.discovery.interval.ms";
            // "flink.partition-discovery.interval-millis" only applies to the legacy FlinkKafkaConsumer.
            properties.put("partition.discovery.interval.ms", INTERVAL_MILLIS);
            properties.put("refresh.leader.backoff.ms", REFRESH_MS);

            /** 2. Kafka source */
            KafkaSource<String> kafkaSource = KafkaSource.<String>builder()
                    .setTopics(TOPIC)
                    .setValueOnlyDeserializer(new SimpleStringSchema())
                    .setProperties(properties)
                    .setStartingOffsets(OffsetsInitializer.latest())
                    .build();

            /** 3. Create the Flink streaming execution environment */
            final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
            env.enableCheckpointing(120000L, CheckpointingMode.EXACTLY_ONCE);
            env.getCheckpointConfig().setMinPauseBetweenCheckpoints(180000L);
            env.getConfig().setAutoWatermarkInterval(0);

            /** 4. Consume the source */
            env.fromSource(kafkaSource, WatermarkStrategy.noWatermarks(), "kafkaSource_sourceName")
                    .uid("kafkaSource_sourceName_uid").name("kafkaSource_sourceName_name").setParallelism(1)

                    .flatMap(new KafkaSourceFlatMap()).uid("KafkaSourceFlatMap_uid").name("KafkaSourceFlatMap_name").setParallelism(1)
                    .flatMap(new ClickHouseSinkFlatMap(Constants.PARALL)).uid("ClickHouseSinkFlatMap_uid").name("ClickHouseSinkFlatMap_name").setParallelism(1);


            env.execute("KafkaToClickHouseJob");
        } catch (Exception e) {
            LOG.error("KafkaToClickHouseJob failed to start!", e);
        }
    }
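The job above and the operators below reference BROKERS, GROUP_ID, TOPIC, INTERVAL_MILLIS, REFRESH_MS (presumably static fields of the job class or static imports) and a Constants class that the original post does not show. The sketch below is a guess at their shape: every value is a placeholder to adapt to your environment, and INSERT_SQL is inferred from the five-column table DDL in section 5.

public class Constants {
    // Placeholder connection details; adapt to your environment.
    public static final String BROKERS = "kafka-1:9092,kafka-2:9092,kafka-3:9092";
    public static final String GROUP_ID = "user_tag_group";
    public static final String TOPIC = "user_tag_topic";
    public static final String INTERVAL_MILLIS = "60000";
    public static final String REFRESH_MS = "60000";

    public static final String URL = "jdbc:clickhouse://ch-host:8123/test";
    public static final String USERNAME = "default";
    public static final String PASSWORD = "";
    public static final String DATABASE = "test";

    // Batch size handed to ClickHouseSinkFlatMap.
    public static final int PARALL = 5000;

    // Empty string, used only to turn int indexes into String map keys.
    public static final String NULL = "";

    // Inferred from the table DDL in section 5.
    public static final String INSERT_SQL =
            "INSERT INTO test.user_tag (user_id, event_type, label_value, recv_date, recv_time) "
                    + "VALUES (?, ?, ?, ?, ?)";
}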

3. Kafka FlatMap Operator

public class KafkaSourceFlatMap extends RichFlatMapFunction<String, Map<String, String>> {
    private static final Logger log = LoggerFactory.getLogger(KafkaSourceFlatMap.class);

    @Override
    public void flatMap(String str, Collector<Map<String, String>> out) {
        try {
            // Fields matching the ClickHouse columns. The original post elides the
            // cleansing logic; the fastjson extraction below is a placeholder sketch
            // assuming the Kafka message is a JSON string carrying these keys.
            JSONObject json = JSON.parseObject(str);
            String user_id = json.getString("user_id");
            String event_type = json.getString("event_type");
            String label_value = json.getString("label_value");
            String recv_date = json.getString("recv_date");
            String recv_time = json.getString("recv_time");
            // Cleansing logic...

            out.collect(listToMap(user_id.trim(),
                    event_type.trim(),
                    label_value.trim(),
                    recv_date.trim(),
                    recv_time));
        } catch (Exception e) {
            // Skip malformed records instead of failing the job.
            log.error("Failed to parse Kafka message: {}", str, e);
        }
    }

    // Keys each value by its 1-based position; Constants.NULL is assumed to be
    // the empty string, used only to turn the int index into a String key.
    public Map<String, String> listToMap(String... str) {
        Map<String, String> resMap = new HashMap<>();
        for (int i = 0; i < str.length; i++) {
            resMap.put((i + 1) + Constants.NULL, str[i]);
        }
        return resMap;
    }
}
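Since the sink binds values to JDBC placeholders by index, the key scheme of listToMap is what ties the two operators together. A small hypothetical demo of the map it produces, assuming Constants.NULL is the empty string:

import java.util.Map;

public class ListToMapDemo {
    public static void main(String[] args) {
        Map<String, String> m = new KafkaSourceFlatMap().listToMap(
                "u001", "click", "1", "2024-05-01", "2024-05-01 12:00:00.000");
        // Five entries keyed "1".."5" (HashMap print order may vary);
        // the sink reads them back with map.get(i + Constants.NULL) for i = 1..5.
        System.out.println(m);
    }
}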

4. ClickHouse FlatMap Operator

public class ClickHouseSinkFlatMap extends RichFlatMapFunction<Map<String, String>, String> {

    private static final Logger LOG = LoggerFactory.getLogger(ClickHouseSinkFlatMap.class);
    private int batch_size;
    private int count = 0;
    private int total = 0;
    private ClickHouseConnection connection = null;
    private PreparedStatement preparedStatement = null;

    public ClickHouseSinkFlatMap(int batch_size) {
        this.batch_size = batch_size;
    }

    @Override
    public void flatMap(Map<String, String> map, Collector<String> collector) {
        try {
            // Lazily open the connection on the first record.
            if (connection == null) {
                connection = getConn();

                preparedStatement = connection.prepareStatement(Constants.INSERT_SQL);
            }

            // Bind values by their 1-based positional keys (see listToMap above).
            for (int i = 1; i <= map.size(); i++) {
                preparedStatement.setString(i, map.get(i + Constants.NULL));
            }

            preparedStatement.addBatch();

            count = count + 1;
            try {
                if (count >= batch_size) {
                    preparedStatement.executeBatch();

                    total += count;
                    LOG.info("Successfully inserted {} rows so far.", total);

                    preparedStatement.clearBatch();
                    count = 0;
                }
            } catch (Exception ee) {
                LOG.error("Batch insert into ClickHouse failed:", ee);
            }
        } catch (Exception ex) {
            LOG.error("ClickHouseSink error:", ex);
        }
    }

    @Override
    public void close() throws Exception {
        // Flush any rows still sitting in the batch, then release JDBC resources.
        if (preparedStatement != null) {
            if (count > 0) {
                preparedStatement.executeBatch();
                total += count;
                LOG.info("Flushed remaining {} rows on close, {} rows in total.", count, total);
            }
            preparedStatement.close();
        }
        if (connection != null) {
            connection.close();
        }
        super.close();
    }

    public static ClickHouseConnection getConn() {

        int socketTimeout = 600000;
        ClickHouseProperties properties = new ClickHouseProperties();
        properties.setUser(Constants.USERNAME);
        properties.setPassword(Constants.PASSWORD);
        properties.setDatabase(Constants.DATABASE);
        properties.setMaxRetries(3);
        properties.setSocketTimeout(socketTimeout);
        ClickHouseDataSource clickHouseDataSource = new ClickHouseDataSource(Constants.URL, properties);
        try {
            return clickHouseDataSource.getConnection();
        } catch (SQLException e) {
            // Don't swallow connection failures silently.
            LOG.error("Failed to obtain ClickHouse connection:", e);
        }
        return null;
    }

}
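Note that the batching here is purely count-based: if traffic pauses while fewer than batch_size rows are buffered, those rows sit in the JDBC batch until the next record arrives or the job shuts down (the flush in close() covers the shutdown case). When latency matters, a time-based flush, for example from a scheduled timer, is a common extension.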

5. ClickHouse Table DDL

-- Local table
CREATE TABLE IF NOT EXISTS test.user_tag_local ON CLUSTER default
(
    user_id String,
    event_type String,
    label_value String,
    recv_date Date,
    recv_time DateTime64(3, 'Asia/Istanbul')
)
ENGINE = MergeTree()
PARTITION BY recv_date
ORDER BY recv_time;


-- Distributed table
CREATE TABLE IF NOT EXISTS test.user_tag ON CLUSTER default
AS test.user_tag_local
ENGINE = Distributed('default', 'test', 'user_tag_local', rand());
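Once the job is running, a quick way to check that rows are landing is to count them through the distributed table, using the same legacy ru.yandex clickhouse-jdbc driver declared in the POM. A minimal smoke test with a placeholder URL:

import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.Statement;

import ru.yandex.clickhouse.ClickHouseDataSource;

public class UserTagSmokeTest {
    public static void main(String[] args) throws Exception {
        // Placeholder URL; point it at any node of the cluster.
        ClickHouseDataSource ds = new ClickHouseDataSource("jdbc:clickhouse://localhost:8123/test");
        try (Connection conn = ds.getConnection();
             Statement stmt = conn.createStatement();
             ResultSet rs = stmt.executeQuery("SELECT count() FROM test.user_tag")) {
            if (rs.next()) {
                System.out.println("user_tag rows: " + rs.getLong(1));
            }
        }
    }
}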
