初始化爬取最近7天的开奖结果任务
This commit is contained in:
@@ -18,9 +18,9 @@ public class BocaiApplication {
|
||||
// // 依次执行三个任务
|
||||
//
|
||||
// 1. 执行CrawlerSchedule方法
|
||||
// System.out.println("\n=== 开始执行CrawlerSchedule任务 ===");
|
||||
// CrawlerSchedule crawlerSchedule = context.getBean(CrawlerSchedule.class);
|
||||
// crawlerSchedule.executePksHistory();
|
||||
System.out.println("\n=== 开始执行初始化爬取最近7天的开奖结果任务 ===");
|
||||
CrawlerSchedule crawlerSchedule = context.getBean(CrawlerSchedule.class);
|
||||
crawlerSchedule.executeLotteryDrawHistory();
|
||||
//
|
||||
// 3. 执行ExBetScriptSchedule方法
|
||||
// System.out.println("\n=== 开始执行ExBetScriptSchedule任务 ===");
|
||||
|
||||
@@ -337,4 +337,135 @@ public class CrawlerSchedule {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//开始爬取最近7天的开奖结果
|
||||
public void executeLotteryDrawHistory() {
|
||||
log.info("开始爬取最近7天的开奖结果");
|
||||
|
||||
LoginInfoResult firstByOrderByCreateTimeDesc = loginInfoRepository.findFirstByOrderByCreateTimeDesc()
|
||||
.orElse(null);
|
||||
if (firstByOrderByCreateTimeDesc == null) {
|
||||
log.error("未找到登录信息");
|
||||
return;
|
||||
}
|
||||
if(firstByOrderByCreateTimeDesc.getOnOff() == ONOFF){
|
||||
log.info("开关已关闭,停止爬取");
|
||||
return;
|
||||
}
|
||||
|
||||
String token = tokenCacheService.getToken();
|
||||
if (token == null || token.isEmpty()) {
|
||||
log.error("token为空");
|
||||
return;
|
||||
}
|
||||
|
||||
// 获取过去7天的日期列表
|
||||
List<String> dateList = DateUtils.getLast7Days();
|
||||
|
||||
for (String date : dateList) {
|
||||
log.info("\n=== 开始爬取日期: {} 的数据 ===", date);
|
||||
|
||||
// 检查该日期的数据文件是否已存在且有数据
|
||||
if (isDateDataExists(date)) {
|
||||
log.info("日期 {} 的数据已存在,跳过爬取", date);
|
||||
continue;
|
||||
}
|
||||
|
||||
// 对每个日期进行重试
|
||||
boolean success = crawlDataForDate(date, token);
|
||||
|
||||
if (success) {
|
||||
log.info("日期 {} 数据爬取成功", date);
|
||||
} else {
|
||||
log.error("日期 {} 数据爬取失败,已达到最大重试次数", date);
|
||||
}
|
||||
|
||||
// 每次请求后稍作等待,避免请求过于频繁
|
||||
try {
|
||||
Thread.sleep(1000);
|
||||
} catch (InterruptedException e) {
|
||||
Thread.currentThread().interrupt();
|
||||
}
|
||||
}
|
||||
|
||||
log.info("最近7天数据爬取完成");
|
||||
}
|
||||
|
||||
/**
|
||||
* 爬取指定日期的数据
|
||||
*/
|
||||
private boolean crawlDataForDate(String date, String token) {
|
||||
int retryCount = 0;
|
||||
boolean success = false;
|
||||
String currentToken = token;
|
||||
|
||||
LoginInfoResult loginInfo = loginInfoRepository.findFirstByOrderByCreateTimeDesc()
|
||||
.orElse(null);
|
||||
if (loginInfo == null) {
|
||||
return false;
|
||||
}
|
||||
|
||||
while (!success && retryCount < MAX_CRA) {
|
||||
log.info("\n=== 第 " + (retryCount + 1) + " 次尝试获取 " + date + " 的开奖结果 ===");
|
||||
|
||||
if (currentToken == null || currentToken.isEmpty()) {
|
||||
log.info("token为空,从数据库重新获取");
|
||||
currentToken = tokenCacheService.getTokenSqlite();
|
||||
if (currentToken == null) {
|
||||
log.error("无法获取有效token");
|
||||
retryCount++;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
log.info("使用token: " + (currentToken.length() > 20 ? currentToken.substring(0, 20) + "..." : currentToken));
|
||||
|
||||
// 创建爬虫实例,传入token
|
||||
LotteryHistoryCrawler crawler = new LotteryHistoryCrawler(currentToken, pypath,date);
|
||||
|
||||
// 构建URL
|
||||
String url = loginInfo.getLoginUrl() + "/member/dresult?lottery=SGFT&date=" + date;
|
||||
|
||||
Spider.create(crawler)
|
||||
.addUrl(url)
|
||||
.thread(1)
|
||||
.run();
|
||||
|
||||
// 检查是否成功解析数据
|
||||
success = LotteryHistoryCrawler.isLastParseSuccess();
|
||||
|
||||
if (!success) {
|
||||
log.info("本次尝试未解析到数据");
|
||||
// 重新获取token(下次重试用)
|
||||
currentToken = tokenCacheService.getTokenSqlite();
|
||||
retryCount++;
|
||||
|
||||
// 等待一下再重试
|
||||
if (retryCount < MAX_CRA) {
|
||||
try {
|
||||
Thread.sleep(2000 * retryCount); // 等待时间递增
|
||||
} catch (InterruptedException e) {
|
||||
Thread.currentThread().interrupt();
|
||||
}
|
||||
}
|
||||
} else {
|
||||
log.info("成功解析到数据");
|
||||
}
|
||||
}
|
||||
|
||||
return success;
|
||||
}
|
||||
/**
|
||||
* 检查指定日期的数据文件是否存在且包含数据
|
||||
*/
|
||||
private boolean isDateDataExists(String date) {
|
||||
try {
|
||||
List<LotteryResult> data = lotteryResultRepository.findByTimeContaining(date);
|
||||
return data != null && !data.isEmpty();
|
||||
} catch (Exception e) {
|
||||
log.warn("检查文件失败: " + e.getMessage());
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -72,6 +72,22 @@ public class DateUtils extends org.apache.commons.lang3.time.DateUtils
|
||||
return targetDate.format(DATE_FORMATTER);
|
||||
}*/
|
||||
|
||||
// 近7天日期的方法
|
||||
public static List<String> getLast7Days() {
|
||||
List<String> dateList = new ArrayList<>();
|
||||
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
|
||||
Calendar calendar = Calendar.getInstance();
|
||||
|
||||
// 从今天开始,往前推7天
|
||||
for (int i = 0; i < 7; i++) {
|
||||
dateList.add(sdf.format(calendar.getTime()));
|
||||
calendar.add(Calendar.DAY_OF_YEAR, -1);
|
||||
}
|
||||
|
||||
return dateList;
|
||||
}
|
||||
|
||||
|
||||
public static void main(String[] args) {
|
||||
System.out.println("====="+getTodayDate());
|
||||
/* Date now = new Date(); // 当前时间
|
||||
|
||||
332
src/main/java/com/tem/bocai/util/LotteryHistoryCrawler.java
Normal file
332
src/main/java/com/tem/bocai/util/LotteryHistoryCrawler.java
Normal file
@@ -0,0 +1,332 @@
|
||||
package com.tem.bocai.util;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.fasterxml.jackson.databind.SerializationFeature;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.jsoup.nodes.Element;
|
||||
import org.jsoup.select.Elements;
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.Spider;
|
||||
import us.codecraft.webmagic.processor.PageProcessor;
|
||||
import us.codecraft.webmagic.selector.Html;
|
||||
import us.codecraft.webmagic.selector.Selectable;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.*;
|
||||
|
||||
//开奖的历史结果
|
||||
@Slf4j
|
||||
public class LotteryHistoryCrawler implements PageProcessor {
|
||||
|
||||
private final String token;
|
||||
// 站点配置
|
||||
private Site site;
|
||||
// final LoginService loginService;
|
||||
// 添加一个字段标记是否成功解析数据
|
||||
private static volatile boolean lastParseSuccess = true;
|
||||
|
||||
private String path;
|
||||
private String date;
|
||||
|
||||
public LotteryHistoryCrawler(String token, String path,String date) {
|
||||
this.token = token;
|
||||
this.path =path;
|
||||
this.date =date;
|
||||
initSite();
|
||||
}
|
||||
|
||||
/**
|
||||
* 初始化Site配置
|
||||
*/
|
||||
private void initSite() {
|
||||
site = Site.me()
|
||||
.setRetryTimes(3)
|
||||
.setSleepTime(1000)
|
||||
.setTimeOut(10000)
|
||||
.setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36");
|
||||
|
||||
// 设置cookie
|
||||
if (token != null && !token.isEmpty()) {
|
||||
site.addHeader("cookie", "token=" + token);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void process(Page page) {
|
||||
// 获取页面HTML
|
||||
Html html = page.getHtml();
|
||||
|
||||
// 打印页面基本信息
|
||||
log.info("页面URL: " + page.getUrl());
|
||||
log.info("页面标题: " + html.xpath("//title/text()").get());
|
||||
// 示例:提取所有表格数据
|
||||
Selectable tables = html.xpath("//table");
|
||||
log.info("找到 " + tables.nodes().size() + " 个表格");
|
||||
if(tables.nodes().isEmpty()){
|
||||
lastParseSuccess = false;
|
||||
}else {
|
||||
lastParseSuccess = true;
|
||||
}
|
||||
// 提取表格数据(根据实际页面结构调整选择器)
|
||||
extractTableData(html);
|
||||
|
||||
// 示例:提取所有链接
|
||||
Selectable links = html.links();
|
||||
System.out.println("页面包含 " + links.all().size() + " 个链接");
|
||||
// 如果需要继续爬取其他页面
|
||||
// page.addTargetRequests(links.all());
|
||||
|
||||
// 将数据存入结果
|
||||
/* page.putField("html========", html.toString());
|
||||
page.putField("title", html.xpath("//title/text()").get());*/
|
||||
parseLotteryHtml(html.toString());
|
||||
|
||||
}
|
||||
|
||||
private void extractTableData(Html html) {
|
||||
// 根据实际页面结构编写数据提取逻辑
|
||||
// 示例:提取所有tr元素
|
||||
Selectable rows = html.xpath("//tr");
|
||||
for (Selectable row : rows.nodes()) {
|
||||
// 提取每行的td内容
|
||||
String rowText = row.xpath("//td/text()").all().toString();
|
||||
if (!rowText.isEmpty()) {
|
||||
System.out.println("行数据: " + rowText);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Site getSite() {
|
||||
|
||||
return site;
|
||||
}
|
||||
|
||||
/**
|
||||
* 添加一个方法获取解析状态
|
||||
*/
|
||||
public static boolean isLastParseSuccess() {
|
||||
return lastParseSuccess;
|
||||
}
|
||||
/**
|
||||
* 解析彩票HTML数据,转换成指定的List<Map<String, Object>>格式
|
||||
*
|
||||
* @param htmlContent 爬取到的HTML文本内容
|
||||
* @return 解析后的结构化数据列表
|
||||
*/
|
||||
public List<Map<String, Object>> parseLotteryHtml(String htmlContent) {
|
||||
List<Map<String, Object>> resultList = new ArrayList<>();
|
||||
|
||||
// 初始化Jsoup解析器
|
||||
Document doc = Jsoup.parse(htmlContent);
|
||||
|
||||
// 定位到数据所在的表格行(drawTable下的table > tbody > tr)
|
||||
Element targetTable = doc.selectFirst("#drawTable");
|
||||
if (targetTable == null) {
|
||||
return resultList;
|
||||
}
|
||||
|
||||
Elements trList = targetTable.select("table > tbody > tr");
|
||||
|
||||
// 遍历每一行数据
|
||||
for (Element tr : trList) {
|
||||
Map<String, Object> rowData = new HashMap<>();
|
||||
|
||||
// 1. 提取期数(id)
|
||||
Element periodTd = tr.selectFirst("td.period");
|
||||
rowData.put("id", periodTd != null ? periodTd.text().trim() : "");
|
||||
|
||||
// 2. 提取开奖时间(time)
|
||||
Element timeTd = tr.selectFirst("td.drawTime");
|
||||
rowData.put("time", timeTd != null ? timeTd.text().trim() : "");
|
||||
|
||||
// 3. 提取开出号码(result)- 10个ballname的数字
|
||||
Elements ballTds = tr.select("td.ballname");
|
||||
List<Integer> resultNumbers = new ArrayList<>();
|
||||
int count = 0;
|
||||
for (Element td : ballTds) {
|
||||
if (count >= 10) break;
|
||||
String text = td.text().trim();
|
||||
if (text.matches("\\d+")) {
|
||||
resultNumbers.add(Integer.parseInt(text));
|
||||
count++;
|
||||
}
|
||||
}
|
||||
rowData.put("result", resultNumbers);
|
||||
|
||||
// 4. 提取winner(other1)
|
||||
Element winnerTd = tr.selectFirst("td.other1");
|
||||
if (winnerTd != null) {
|
||||
String winnerText = winnerTd.text().trim();
|
||||
if (winnerText.matches("\\d+")) {
|
||||
rowData.put("winner", Integer.parseInt(winnerText));
|
||||
} else {
|
||||
rowData.put("winner", "");
|
||||
}
|
||||
} else {
|
||||
rowData.put("winner", "");
|
||||
}
|
||||
|
||||
// 5. 提取GD1(冠亚小/大)、GD2(冠亚单/双)
|
||||
Elements otherTds = tr.select("td.other");
|
||||
String gd1 = "";
|
||||
String gd2 = "";
|
||||
for (Element td : otherTds) {
|
||||
String className = td.className();
|
||||
if (className.contains("GDX")) {
|
||||
gd1 = td.text().trim();
|
||||
} else if (className.contains("GDS")) {
|
||||
gd2 = td.text().trim();
|
||||
}
|
||||
}
|
||||
rowData.put("GD1", gd1);
|
||||
rowData.put("GD2", gd2);
|
||||
|
||||
// 6. 提取sum1(dldhl_sum)、sum2(dldhh_sum)
|
||||
Element sum1Td = tr.selectFirst("td.dldhl_sum");
|
||||
if (sum1Td != null) {
|
||||
String sum1Text = sum1Td.text().trim();
|
||||
if (sum1Text.matches("\\d+")) {
|
||||
rowData.put("sum1", Integer.parseInt(sum1Text));
|
||||
} else {
|
||||
rowData.put("sum1", "");
|
||||
}
|
||||
} else {
|
||||
rowData.put("sum1", "");
|
||||
}
|
||||
|
||||
Element sum2Td = tr.selectFirst("td.dldhh_sum");
|
||||
if (sum2Td != null) {
|
||||
String sum2Text = sum2Td.text().trim();
|
||||
if (sum2Text.matches("\\d+")) {
|
||||
rowData.put("sum2", Integer.parseInt(sum2Text));
|
||||
} else {
|
||||
rowData.put("sum2", "");
|
||||
}
|
||||
} else {
|
||||
rowData.put("sum2", "");
|
||||
}
|
||||
|
||||
// 7. 提取GLH_result(龙虎结果,5个GLH开头的td)
|
||||
List<String> glhResults = new ArrayList<>();
|
||||
int glhCount = 0;
|
||||
for (Element td : otherTds) {
|
||||
if (glhCount >= 5) break;
|
||||
String className = td.className();
|
||||
if (className.contains("GLH_")) {
|
||||
glhResults.add(td.text().trim());
|
||||
glhCount++;
|
||||
}
|
||||
}
|
||||
rowData.put("GLH_result", glhResults);
|
||||
|
||||
// 将单行数据加入结果列表(只保留有期数的有效行)
|
||||
if (!rowData.get("id").toString().isEmpty()) {
|
||||
resultList.add(rowData);
|
||||
}
|
||||
}
|
||||
// 将数据写入SQLite数据库
|
||||
SQLiteUtil.writeToSQLite(resultList);
|
||||
// 将数据写入JSON文件(保留原有功能)
|
||||
writeToJsonFile(resultList);
|
||||
log.info("历史爬虫打印结果===" + resultList);
|
||||
return resultList;
|
||||
}
|
||||
|
||||
public void writeToJsonFile(List<Map<String, Object>> resultList) {
|
||||
try {
|
||||
// 创建 ObjectMapper 实例
|
||||
ObjectMapper objectMapper = new ObjectMapper();
|
||||
|
||||
// 设置 JSON 格式化(可选,更易读)
|
||||
objectMapper.enable(SerializationFeature.INDENT_OUTPUT);
|
||||
|
||||
// 定义输出目录
|
||||
String directoryPath = path+"/current_data"; // 项目根目录下的 output/json 文件夹
|
||||
|
||||
// 使用年月日作为文件名(格式:result_yyyyMMdd.json)
|
||||
String fileName = "result_" + date + ".json";
|
||||
String filePath = directoryPath + "/" + fileName;
|
||||
|
||||
// 创建目录(如果不存在)
|
||||
File directory = new File(directoryPath);
|
||||
if (!directory.exists()) {
|
||||
directory.mkdirs(); // 创建多级目录
|
||||
}
|
||||
|
||||
// 创建文件对象
|
||||
File outputFile = new File(filePath);
|
||||
|
||||
// 如果文件已存在,读取现有数据并对比
|
||||
List<Map<String, Object>> existingData = new ArrayList<>();
|
||||
Set<String> existingIds = new HashSet<>();
|
||||
if (outputFile.exists()) {
|
||||
try {
|
||||
existingData = objectMapper.readValue(outputFile,
|
||||
objectMapper.getTypeFactory().constructCollectionType(List.class, Map.class));
|
||||
for (Map<String, Object> item : existingData) {
|
||||
if (item.containsKey("id")) {
|
||||
existingIds.add(item.get("id").toString());
|
||||
}
|
||||
}
|
||||
log.info("已读取现有数据,共 " + existingData.size() + " 条记录");
|
||||
} catch (IOException e) {
|
||||
log.warn("读取现有文件失败,将覆盖写入: " + e.getMessage());
|
||||
existingIds.clear();
|
||||
}
|
||||
}
|
||||
|
||||
// 筛选出新增的数据(id不在existingIds中的记录)
|
||||
List<Map<String, Object>> newData = new ArrayList<>();
|
||||
for (Map<String, Object> item : resultList) {
|
||||
if (item.containsKey("id")) {
|
||||
String id = item.get("id").toString();
|
||||
if (!existingIds.contains(id)) {
|
||||
newData.add(item);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 合并现有数据和新数据
|
||||
List<Map<String, Object>> finalData = new ArrayList<>();
|
||||
if (!existingData.isEmpty()) {
|
||||
finalData.addAll(existingData);
|
||||
}
|
||||
finalData.addAll(newData);
|
||||
|
||||
// 将合并后的数据写入 JSON 文件
|
||||
objectMapper.writeValue(outputFile, finalData);
|
||||
log.info("数据已成功写入文件: " + outputFile.getAbsolutePath() +
|
||||
" (现有: " + existingData.size() + " 条, 新增: " + newData.size() + " 条, 总计: " + finalData.size() + " 条)");
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
log.error("写入 JSON 文件失败: " + e.getMessage(), e);
|
||||
throw new RuntimeException("写入 JSON 文件失败: " + e.getMessage(), e);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public static void main(String[] args) {
|
||||
String url = "https://4701268539-esh.qdk63ayw8g.com/member/dresult?lottery=SGFT&date=2026-02-06";
|
||||
|
||||
// 创建爬虫
|
||||
Spider.create(new LotteryHistoryCrawler("","",""))
|
||||
.addUrl(url) // 添加起始URL
|
||||
.thread(1) // 线程数
|
||||
.run(); // 开始爬取
|
||||
}
|
||||
|
||||
|
||||
// 自定义headers
|
||||
/*private Map<String, String> getHeaders() {
|
||||
Map<String, String> headers = new HashMap<>();
|
||||
headers.put("cookie", "token=a1b219fe7e39374d6af532c56fdc911b76ae8f83");
|
||||
|
||||
return headers;
|
||||
}*/
|
||||
}
|
||||
Reference in New Issue
Block a user