大数据—疫情防控项目数据爬取(一)
一、创建SpringBoot项目可以直接选择导入SpringBoot,也可以选择Maven项目【Maven项目需要后期手动导入SpringBoot所依赖的环境】点击next进入下一阶段,然后选择SpringBoot的开发工具,Spring Boot DevTools和Lombok;在web里面可以选择Spring Web在Messaging里面选择Spring for Apache Kafka,后
·
一、创建SpringBoot项目
- 可以直接选择导入SpringBoot,也可以选择Maven项目【Maven项目需要后期手动导入SpringBoot所依赖的环境】
-
点击next进入下一阶段,然后选择SpringBoot的开发工具,Spring Boot DevTools和Lombok;
在web里面可以选择Spring Web
在Messaging里面选择Spring for Apache Kafka,后期如果需要其他的我们可以再加上 -
点击next,此步看看所在的目录是否正确;然后点击Finish完成新建的项目
二、环境准备
1、pom.xml
在pom.xml文件中添加以下的依赖数据
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-parent</artifactId>
<version>2.3.2.RELEASE</version>
<relativePath/> <!-- lookup parent from repository -->
</parent>
<groupId>cn.itcast</groupId>
<artifactId>datasource</artifactId>
<version>0.0.1-SNAPSHOT</version>
<name>datasource</name>
<description>Demo project for Spring Boot</description>
<properties>
<java.version>1.8</java.version>
</properties>
<dependencies>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-web</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.kafka</groupId>
<artifactId>spring-kafka</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-devtools</artifactId>
<scope>runtime</scope>
<optional>true</optional>
</dependency>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<optional>true</optional>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-test</artifactId>
<scope>test</scope>
<exclusions>
<exclusion>
<groupId>org.junit.vintage</groupId>
<artifactId>junit-vintage-engine</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.springframework.kafka</groupId>
<artifactId>spring-kafka-test</artifactId>
<scope>test</scope>
</dependency>
<!-- 1.2.22 has multiple known remote-code-execution vulnerabilities;
     1.2.83 is the first version with autoType RCE mitigations (CVE-2022-25845). -->
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.83</version>
</dependency>
<!-- 4.5.13 fixes CVE-2020-13956 (malformed authority component handling). -->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.13</version>
</dependency>
<!-- 1.14.3 fixes CVE-2021-37714 (untrusted HTML can cause an infinite loop). -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.14.3</version>
</dependency>
<!-- 4.13.2 fixes CVE-2020-15250 (TemporaryFolder information disclosure). -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.13.2</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>3.12.0</version>
</dependency>
<!-- 2.7+ fixes CVE-2021-29425 (path traversal in FileNameUtils). -->
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.11.0</version>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
<version>1.7.36</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-maven-plugin</artifactId>
</plugin>
</plugins>
</build>
</project>
- 然后在src文件中,
main->java->cn->itcast
目录里面有一个DatasourceApplication.java
文件;当我们添加依赖后,运行此文件,如果不报错,就代表我们的依赖搭建成功。 - 在
src->main->resources
目录里面有一个application.properties
文件,我们可以在这里面加一些服务器端口、Kafka集群的设置等等
server.port=9999
#kafka
# Kafka broker (bootstrap server) addresses
kafka.bootstrap.servers=node01:9092,node02:9092,node03:9092
# Number of retries when sending a message fails
kafka.retries_config=0
# Batch size in bytes (producer default is 16384 bytes, i.e. 16 KB)
kafka.batch_size_config=4096
# Upper bound (ms) on how long a batch may linger before being sent
kafka.linger_ms_config=100
# Producer buffer memory size in bytes
kafka.buffer_memory_config=40960
# Topic the crawled data is published to
kafka.topic=covid19
配置完成后,我们再次启动DatasourceApplication.java
文件,此时的服务器端口等配置就已经修改完成
2、创建爬虫的工具类和时间的工具类
- 在
datasource\src\main\java\cn\itcast\utils
里面创建一个爬虫的工具类
package cn.itcast.utils;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;
/**
 * HttpClient helper for crawling web pages: shared connection pool,
 * uniform request timeouts, and a randomized User-Agent per request.
 */
public abstract class HttpUtils {
    // Shared connection pool (HttpClient connection manager) for all requests.
    private static PoolingHttpClientConnectionManager cm = null;
    // Browser User-Agent strings; one is picked at random for each request
    // so the crawler looks less like a bot.
    private static List<String> userAgentList = null;
    // Timeout configuration applied to every request.
    private static RequestConfig config = null;

    // Static initializer: runs once when the class is loaded.
    static {
        cm = new PoolingHttpClientConnectionManager();
        cm.setMaxTotal(200);          // max total connections in the pool
        cm.setDefaultMaxPerRoute(20); // max connections per target host
        config = RequestConfig.custom()
                .setSocketTimeout(10000)
                .setConnectTimeout(10000)
                .setConnectionRequestTimeout(10000)
                .build();
        userAgentList = new ArrayList<String>();
        userAgentList.add("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36");
        userAgentList.add("Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15;rv:73.0) Gecko/20100101 Firefox/73.0");
        userAgentList.add("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3)AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.5 Safari/605.1.15");
        userAgentList.add("Mozilla/5.0 (Windows NT 10.0; Win64; x64)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 Edge/16.16299");
        userAgentList.add("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36");
        userAgentList.add("Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0");
    }

    /**
     * Fetches the page at {@code url} and returns its body decoded as UTF-8.
     *
     * @param url the URL to fetch
     * @return the page HTML on HTTP 200 (possibly empty if there is no entity),
     *         or {@code null} on any other status code or on an I/O error
     */
    public static String getHtml(String url) {
        // 1. Build a client backed by the shared connection pool.
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();
        // 2. Prepare the GET request with timeouts and a random User-Agent.
        HttpGet httpGet = new HttpGet(url);
        httpGet.setConfig(config);
        httpGet.setHeader("User-Agent", userAgentList.get(new Random().nextInt(userAgentList.size())));
        // 3. Execute. try-with-resources closes the response in every path.
        //    BUG FIX: the original closed the response in a finally block with
        //    no null check, so any failure inside execute() (e.g. a connect
        //    timeout) threw a NullPointerException on response.close().
        try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
            // 4. Read the body only on a 200 response.
            if (response.getStatusLine().getStatusCode() == 200) {
                String html = "";
                if (response.getEntity() != null) {
                    html = EntityUtils.toString(response.getEntity(), "utf-8");
                }
                return html;
            }
        } catch (IOException e) {
            // Best-effort crawler: report the failure and fall through to null.
            e.printStackTrace();
        }
        return null;
    }

    public static void main(String[] args) {
        String html = HttpUtils.getHtml("https://blog.csdn.net/qq_44096670/article/details/107594671");
        System.out.println(html);
    }
}
- 然后再在其目录中创建一个时间的工具类TimeUtils
package cn.itcast.utils;

import java.time.Instant;
import java.time.ZoneId;
import java.time.format.DateTimeFormatter;

import org.apache.commons.lang3.time.FastDateFormat;
/**
 * Time formatting utility.
 */
public abstract class TimeUtils {
    /**
     * Formats an epoch-millisecond timestamp with the given pattern in the
     * system default time zone.
     *
     * <p>Uses the JDK's thread-safe {@code java.time} API instead of the
     * third-party FastDateFormat, removing an external dependency.
     * NOTE(review): pattern letters match SimpleDateFormat for common patterns
     * such as "yyyy-MM-dd HH:mm:ss"; exotic letters ('u', 'Y') differ in
     * java.time — confirm no caller relies on them.
     *
     * @param timestamp epoch milliseconds; must not be {@code null}
     * @param pattern   date-time pattern, e.g. "yyyy-MM-dd"
     * @return the formatted date string
     */
    public static String format(Long timestamp, String pattern) {
        return DateTimeFormatter.ofPattern(pattern)
                .format(Instant.ofEpochMilli(timestamp).atZone(ZoneId.systemDefault()));
    }

    public static void main(String[] args) {
        String format = TimeUtils.format(System.currentTimeMillis(), "yyyy-MM-dd");
        System.out.println(format);
    }
}
- 在
datasource\src\main\java\cn\itcast\
里面创建一个用于爬虫的包crawler
- 在
datasource\src\main\java\cn\itcast\
里面创建一个用于生成数据的包generator
更多推荐
已为社区贡献1条内容
所有评论(0)