Spark-SparkStreaming+Kafka+zookeeper 手动管理offsets

发布于:2021-10-22 12:17:47

一、概述


SparkStreaming 以 Kafka 作为数据源,手动管理 offsets 并将其保存在 ZooKeeper 中。本文包含两个类:SparkStreamingOnKafka(程序入口)和 KafkaZookeeperUtils(获取数据并更新 offsets 的工具类)。

二、代码


1、SparkStreamingOnKafka


package com.cfl.spark.streaming;

import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import com.cfl.spark.streaming.KafkaZookeeperUtils.DataCallBack;

/**
 * Entry point of the example: builds a local streaming context with a 5-second batch
 * interval, hands it to {@link KafkaZookeeperUtils#getData} and prints every message of
 * every batch to standard output.
 *
 * @author chenfenli
 */
public class SparkStreamingOnKafka {

    /** Kafka broker list passed to the direct stream. */
    private static final String KAFKA_SERVER = "192.168.1.103:9092";
    /** ZooKeeper connect string used to load and store the offsets. */
    private static final String ZK_SERVER = "192.168.1.103:2181";
    /** Consumer group whose offsets are tracked. */
    private static final String GROUP_ID = "test6";
    /** Topic to consume. */
    private static final String TOPIC = "t0407";

    /**
     * Starts the streaming job and blocks until it terminates.
     *
     * @param args unused
     * @throws InterruptedException if the wait for termination is interrupted (declared
     *         because newer Spark releases declare it on {@code awaitTermination()};
     *         harmless on releases that do not)
     */
    public static void main(String[] args) throws InterruptedException {
        SparkConf sparkConf = new SparkConf();
        sparkConf.setAppName("SparkStreamingOnKafka");
        sparkConf.setMaster("local");
        JavaStreamingContext context = new JavaStreamingContext(sparkConf, Durations.seconds(5));

        // Return value contract: true -> offsets are updated, false -> offsets are left untouched.
        // Elements are read as Object so this compiles regardless of how the callback's
        // List parameter is parameterized.
        DataCallBack printLines = lines -> {
            try {
                for (Object line : lines) {
                    System.out.println(line);
                }
                return true;
            } catch (Exception e) {
                System.out.println(e);
                return false;
            }
        };

        KafkaZookeeperUtils.getData(context, KAFKA_SERVER, ZK_SERVER, GROUP_ID, TOPIC, printLines);

        context.start();
        try {
            context.awaitTermination();
        } finally {
            // Stop the context even when the wait ends with an exception.
            context.stop();
        }
    }
}

2、KafkaZookeeperUtils


package com.cfl.spark.streaming;

import java.io.Serializable;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicReference;
import org.I0Itec.zkclient.ZkClient;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka.HasOffsetRanges;
import org.apache.spark.streaming.kafka.KafkaUtils;
import org.apache.spark.streaming.kafka.OffsetRange;
import kafka.common.TopicAndPartition;
import kafka.message.MessageAndMetadata;
import kafka.serializer.StringDecoder;
import kafka.utils.ZKGroupTopicDirs;
import kafka.utils.ZkUtils;
import scala.Tuple2;

public class KafkaZookeeperUtils implements Serializable{

private static final long serialVersionUID = 1L;

public static void getData(JavaStreamingContext context, String kafkaServer, String zkServer, String groupId, String topic, DataCallBack dataCallBack) {

final AtomicReference offsetRanges = new AtomicReference<>();

Map kafkaParameters = new HashMap<>();
kafkaParameters.put("metadata.broker.list", kafkaServer);
HashSet topis = new HashSet<>();
topis.add(topic);

// 获取当前offsets
ZKGroupTopicDirs zgt=new ZKGroupTopicDirs(groupId,topic);
final String zkTopicPath=zgt.consumerOffsetDir();
ZkClient zkClient=new ZkClient(zkServer);
int countChildren=zkClient.countChildren(zkTopicPath);
Map fromOffsets=new HashMap<>();
for (int i = 0; i < countChildren; i++) {
String path=zkTopicPath+"/"+i;
String offset=zkClient.readData(path);
TopicAndPartition topicAndPartition=new TopicAndPartition(topic,i);
fromOffsets.put(topicAndPartition,Long.parseLong(offset));
}
zkClient.close();

// 创建链接对象
if(fromOffsets.size() > 0) {
// 非第一次消费
JavaInputDStream inputDStream = KafkaUtils.createDirectStream(context, String.class, String.class, StringDecoder.class, StringDecoder.class, String.class, kafkaParameters, fromOffsets, new Function, String>() {
private static final long serialVersionUID = 1L;
@Override
public String call(MessageAndMetadata arg0) throws Exception {
return arg0.message();
}
});
JavaDStream dStream = inputDStream.transform(new Function, JavaRDD>() {
private static final long serialVersionUID = 1L;
@Override
public JavaRDD call(JavaRDD arg0) throws Exception {
OffsetRange[] offsets = ((HasOffsetRanges) arg0.rdd()).offsetRanges();
offsetRanges.set(offsets);
return arg0;
}
});
dStream.foreachRDD(new VoidFunction>() {
private static final long serialVersionUID = 1L;
@Override
public void call(JavaRDD arg0) throws Exception {
// 这里业务
boolean flag = dataCallBack.data(arg0.collect());

// 更新offsets
if(flag) {
ZkClient zkClient2 = new ZkClient(zkServer);
OffsetRange[] offsets = offsetRanges.get();
if (null != offsets) {
for (OffsetRange o : offsets) {
String zkPath = zkTopicPath + "/" + o.partition();
ZkUtils.updatePersistentPath(zkClient2, zkPath, o.untilOffset() + "");
}
}
zkClient2.close();
}
}
});
} else {
// 第一次消费: 从最大偏移量开始消费,如果想从第一条数据开始消费,需手动初始化fromOffsets,调用非第一次消费的方法
KafkaUtils.createDirectStream(context, String.class, String.class, StringDecoder.class, StringDecoder.class, kafkaParameters, topis)
.transformToPair(new Function, JavaPairRDD>() {
private static final long serialVersionUID = 1L;
@Override
public JavaPairRDD call(JavaPairRDD arg0) throws Exception {
OffsetRange[] offsets = ((HasOffsetRanges) arg0.rdd()).offsetRanges();
offsetRanges.set(offsets);
return arg0;
}
})
.map(new Function, String>() {
private static final long serialVersionUID = 1L;
@Override
public String call(Tuple2 arg0) throws Exception {
return arg0._2;
}
})
.foreachRDD(new VoidFunction>() {
private static final long serialVersionUID = 1L;
@Override
public void call(JavaRDD arg0) throws Exception {
// 这里业务
boolean flag = dataCallBack.data(arg0.collect());

// 更新offsets
if(flag) {
ZkClient zkClient2 = new ZkClient(zkServer);
OffsetRange[] offsets = offsetRanges.get();
if (null != offsets) {
for (OffsetRange o : offsets) {
String zkPath = zkTopicPath + "/" + o.partition();
ZkUtils.updatePersistentPath(zkClient2, zkPath, o.untilOffset() + "");
}
}
zkClient2.close();
}
}
});
}
}

interface DataCallBack {
boolean data(List lines);
}
}

?

相关推荐

最新更新

猜你喜欢