初始化爬取最近7天的开奖结果任务

This commit is contained in:
xuelijun
2026-02-25 09:53:43 +08:00
parent 029109d559
commit 67335e08aa
4 changed files with 482 additions and 3 deletions

View File

@@ -18,9 +18,9 @@ public class BocaiApplication {
// // 依次执行三个任务
//
// 1. 执行CrawlerSchedule方法
// System.out.println("\n=== 开始执行CrawlerSchedule任务 ===");
// CrawlerSchedule crawlerSchedule = context.getBean(CrawlerSchedule.class);
// crawlerSchedule.executePksHistory();
System.out.println("\n=== 开始执行初始化爬取最近7天的开奖结果任务 ===");
CrawlerSchedule crawlerSchedule = context.getBean(CrawlerSchedule.class);
crawlerSchedule.executeLotteryDrawHistory();
//
// 3. 执行ExBetScriptSchedule方法
// System.out.println("\n=== 开始执行ExBetScriptSchedule任务 ===");

View File

@@ -337,4 +337,135 @@ public class CrawlerSchedule {
}
}
/**
 * Crawls the draw results for every date returned by {@code DateUtils.getLast7Days()}.
 *
 * <p>The run is abandoned up front when no login record exists, when the latest login
 * record's on/off value equals {@code ONOFF}, or when the cached token is null or empty.
 * Dates for which {@link #isDateDataExists(String)} reports stored rows are skipped.
 * A one-second pause follows each crawled date to avoid hammering the remote site.
 *
 * <p>If the thread is interrupted during that pause, the interrupt flag is restored and
 * the remaining dates are not crawled.
 */
public void executeLotteryDrawHistory() {
    log.info("开始爬取最近7天的开奖结果");
    LoginInfoResult latestLogin = loginInfoRepository.findFirstByOrderByCreateTimeDesc()
            .orElse(null);
    if (latestLogin == null) {
        log.error("未找到登录信息");
        return;
    }
    // NOTE(review): '==' is only safe if getOnOff()/ONOFF are primitives or enums;
    // if both are boxed Integers this compares references — confirm the declared types.
    if (latestLogin.getOnOff() == ONOFF) {
        log.info("开关已关闭,停止爬取");
        return;
    }
    String token = tokenCacheService.getToken();
    if (token == null || token.isEmpty()) {
        log.error("token为空");
        return;
    }

    List<String> dateList = DateUtils.getLast7Days();
    for (String date : dateList) {
        log.info("\n=== 开始爬取日期: {} 的数据 ===", date);
        if (isDateDataExists(date)) {
            log.info("日期 {} 的数据已存在,跳过爬取", date);
            continue;
        }

        // crawlDataForDate performs its own retries; the result is only reported here.
        if (crawlDataForDate(date, token)) {
            log.info("日期 {} 数据爬取成功", date);
        } else {
            log.error("日期 {} 数据爬取失败,已达到最大重试次数", date);
        }

        // Throttle between dates so requests are not sent back-to-back.
        try {
            Thread.sleep(1000);
        } catch (InterruptedException e) {
            // Honour the interruption: restore the flag and stop instead of crawling on.
            Thread.currentThread().interrupt();
            log.warn("爬取任务被中断,提前结束");
            return;
        }
    }
    log.info("最近7天数据爬取完成");
}
/**
 * Crawls the draw-result page of a single date, retrying up to {@code MAX_CRA} times.
 *
 * <p>Each attempt runs a single-threaded {@link Spider} with a fresh
 * {@link LotteryHistoryCrawler} and then reads
 * {@link LotteryHistoryCrawler#isLastParseSuccess()}. After a failed attempt the token is
 * re-read via {@code tokenCacheService.getTokenSqlite()} and the method waits
 * {@code 2s * attempt} before trying again. An attempt for which no non-empty token can be
 * obtained is counted as failed without any request being sent.
 *
 * @param date  date appended to the request URL as the {@code date} query parameter
 * @param token token used for the first attempt; may be null or empty, in which case one
 *              is fetched from {@code tokenCacheService}
 * @return {@code true} if an attempt reported a successful parse; {@code false} if no
 *         login record exists, all attempts failed, or the thread was interrupted
 */
private boolean crawlDataForDate(String date, String token) {
    LoginInfoResult loginInfo = loginInfoRepository.findFirstByOrderByCreateTimeDesc()
            .orElse(null);
    if (loginInfo == null) {
        return false;
    }

    // The URL does not depend on the attempt, so build it once.
    String url = loginInfo.getLoginUrl() + "/member/dresult?lottery=SGFT&date=" + date;
    String currentToken = token;

    for (int attempt = 1; attempt <= MAX_CRA; attempt++) {
        log.info("\n=== 第 {} 次尝试获取 {} 的开奖结果 ===", attempt, date);

        if (currentToken == null || currentToken.isEmpty()) {
            log.info("token为空从数据库重新获取");
            currentToken = tokenCacheService.getTokenSqlite();
            // Treat an empty token like a missing one, matching the check above.
            if (currentToken == null || currentToken.isEmpty()) {
                log.error("无法获取有效token");
                continue;
            }
        }
        // NOTE(review): this writes up to 20 characters of the token to the log —
        // consider masking more of it.
        log.info("使用token: {}",
                currentToken.length() > 20 ? currentToken.substring(0, 20) + "..." : currentToken);

        LotteryHistoryCrawler crawler = new LotteryHistoryCrawler(currentToken, pypath, date);
        Spider.create(crawler)
                .addUrl(url)
                .thread(1)
                .run();

        if (LotteryHistoryCrawler.isLastParseSuccess()) {
            log.info("成功解析到数据");
            return true;
        }

        log.info("本次尝试未解析到数据");
        // Refresh the token for the next attempt.
        currentToken = tokenCacheService.getTokenSqlite();

        if (attempt < MAX_CRA) {
            try {
                Thread.sleep(2000L * attempt); // back-off grows with each failed attempt
            } catch (InterruptedException e) {
                // Restore the flag and give up rather than keep retrying.
                Thread.currentThread().interrupt();
                return false;
            }
        }
    }
    return false;
}
/**
 * Reports whether draw results for the given date are already stored.
 *
 * <p>Queries {@code lotteryResultRepository.findByTimeContaining(date)}. Any exception
 * raised by the lookup is logged and treated as "no data", so the caller will crawl the
 * date again.
 *
 * @param date date string passed to the repository lookup
 * @return {@code true} if the lookup returned a non-null, non-empty list
 */
private boolean isDateDataExists(String date) {
    try {
        List<LotteryResult> stored = lotteryResultRepository.findByTimeContaining(date);
        return stored != null && !stored.isEmpty();
    } catch (Exception e) {
        // Best-effort check: keep the stack trace in the log but do not fail the crawl.
        log.warn("检查日期 {} 的已有数据失败: {}", date, e.getMessage(), e);
        return false;
    }
}
}

View File

@@ -72,6 +72,22 @@ public class DateUtils extends org.apache.commons.lang3.time.DateUtils
return targetDate.format(DATE_FORMATTER);
}*/
/**
 * Returns today and the six preceding days formatted as {@code yyyy-MM-dd}.
 *
 * @return seven date strings, newest (today) first
 */
public static List<String> getLast7Days() {
    final SimpleDateFormat dayFormat = new SimpleDateFormat("yyyy-MM-dd");
    final Calendar cursor = Calendar.getInstance();
    final List<String> days = new ArrayList<>();
    int remaining = 7;
    while (remaining-- > 0) {
        // Record the current day, then step the cursor back by one day.
        days.add(dayFormat.format(cursor.getTime()));
        cursor.add(Calendar.DAY_OF_YEAR, -1);
    }
    return days;
}
public static void main(String[] args) {
System.out.println("====="+getTodayDate());
/* Date now = new Date(); // 当前时间

View File

@@ -0,0 +1,332 @@
package com.tem.bocai.util;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.SerializationFeature;
import lombok.extern.slf4j.Slf4j;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Selectable;
import java.io.File;
import java.io.IOException;
import java.util.*;
//开奖的历史结果
@Slf4j
public class LotteryHistoryCrawler implements PageProcessor {
private final String token;
// 站点配置
private Site site;
// final LoginService loginService;
// 添加一个字段标记是否成功解析数据
private static volatile boolean lastParseSuccess = true;
private String path;
private String date;
public LotteryHistoryCrawler(String token, String path,String date) {
this.token = token;
this.path =path;
this.date =date;
initSite();
}
/**
* 初始化Site配置
*/
private void initSite() {
site = Site.me()
.setRetryTimes(3)
.setSleepTime(1000)
.setTimeOut(10000)
.setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36");
// 设置cookie
if (token != null && !token.isEmpty()) {
site.addHeader("cookie", "token=" + token);
}
}
@Override
public void process(Page page) {
// 获取页面HTML
Html html = page.getHtml();
// 打印页面基本信息
log.info("页面URL: " + page.getUrl());
log.info("页面标题: " + html.xpath("//title/text()").get());
// 示例:提取所有表格数据
Selectable tables = html.xpath("//table");
log.info("找到 " + tables.nodes().size() + " 个表格");
if(tables.nodes().isEmpty()){
lastParseSuccess = false;
}else {
lastParseSuccess = true;
}
// 提取表格数据(根据实际页面结构调整选择器)
extractTableData(html);
// 示例:提取所有链接
Selectable links = html.links();
System.out.println("页面包含 " + links.all().size() + " 个链接");
// 如果需要继续爬取其他页面
// page.addTargetRequests(links.all());
// 将数据存入结果
/* page.putField("html========", html.toString());
page.putField("title", html.xpath("//title/text()").get());*/
parseLotteryHtml(html.toString());
}
private void extractTableData(Html html) {
// 根据实际页面结构编写数据提取逻辑
// 示例提取所有tr元素
Selectable rows = html.xpath("//tr");
for (Selectable row : rows.nodes()) {
// 提取每行的td内容
String rowText = row.xpath("//td/text()").all().toString();
if (!rowText.isEmpty()) {
System.out.println("行数据: " + rowText);
}
}
}
@Override
public Site getSite() {
return site;
}
/**
* 添加一个方法获取解析状态
*/
public static boolean isLastParseSuccess() {
return lastParseSuccess;
}
/**
* 解析彩票HTML数据转换成指定的List<Map<String, Object>>格式
*
* @param htmlContent 爬取到的HTML文本内容
* @return 解析后的结构化数据列表
*/
public List<Map<String, Object>> parseLotteryHtml(String htmlContent) {
List<Map<String, Object>> resultList = new ArrayList<>();
// 初始化Jsoup解析器
Document doc = Jsoup.parse(htmlContent);
// 定位到数据所在的表格行drawTable下的table > tbody > tr
Element targetTable = doc.selectFirst("#drawTable");
if (targetTable == null) {
return resultList;
}
Elements trList = targetTable.select("table > tbody > tr");
// 遍历每一行数据
for (Element tr : trList) {
Map<String, Object> rowData = new HashMap<>();
// 1. 提取期数id
Element periodTd = tr.selectFirst("td.period");
rowData.put("id", periodTd != null ? periodTd.text().trim() : "");
// 2. 提取开奖时间time
Element timeTd = tr.selectFirst("td.drawTime");
rowData.put("time", timeTd != null ? timeTd.text().trim() : "");
// 3. 提取开出号码result- 10个ballname的数字
Elements ballTds = tr.select("td.ballname");
List<Integer> resultNumbers = new ArrayList<>();
int count = 0;
for (Element td : ballTds) {
if (count >= 10) break;
String text = td.text().trim();
if (text.matches("\\d+")) {
resultNumbers.add(Integer.parseInt(text));
count++;
}
}
rowData.put("result", resultNumbers);
// 4. 提取winnerother1
Element winnerTd = tr.selectFirst("td.other1");
if (winnerTd != null) {
String winnerText = winnerTd.text().trim();
if (winnerText.matches("\\d+")) {
rowData.put("winner", Integer.parseInt(winnerText));
} else {
rowData.put("winner", "");
}
} else {
rowData.put("winner", "");
}
// 5. 提取GD1冠亚小/大、GD2冠亚单/双)
Elements otherTds = tr.select("td.other");
String gd1 = "";
String gd2 = "";
for (Element td : otherTds) {
String className = td.className();
if (className.contains("GDX")) {
gd1 = td.text().trim();
} else if (className.contains("GDS")) {
gd2 = td.text().trim();
}
}
rowData.put("GD1", gd1);
rowData.put("GD2", gd2);
// 6. 提取sum1dldhl_sum、sum2dldhh_sum
Element sum1Td = tr.selectFirst("td.dldhl_sum");
if (sum1Td != null) {
String sum1Text = sum1Td.text().trim();
if (sum1Text.matches("\\d+")) {
rowData.put("sum1", Integer.parseInt(sum1Text));
} else {
rowData.put("sum1", "");
}
} else {
rowData.put("sum1", "");
}
Element sum2Td = tr.selectFirst("td.dldhh_sum");
if (sum2Td != null) {
String sum2Text = sum2Td.text().trim();
if (sum2Text.matches("\\d+")) {
rowData.put("sum2", Integer.parseInt(sum2Text));
} else {
rowData.put("sum2", "");
}
} else {
rowData.put("sum2", "");
}
// 7. 提取GLH_result龙虎结果5个GLH开头的td
List<String> glhResults = new ArrayList<>();
int glhCount = 0;
for (Element td : otherTds) {
if (glhCount >= 5) break;
String className = td.className();
if (className.contains("GLH_")) {
glhResults.add(td.text().trim());
glhCount++;
}
}
rowData.put("GLH_result", glhResults);
// 将单行数据加入结果列表(只保留有期数的有效行)
if (!rowData.get("id").toString().isEmpty()) {
resultList.add(rowData);
}
}
// 将数据写入SQLite数据库
SQLiteUtil.writeToSQLite(resultList);
// 将数据写入JSON文件保留原有功能
writeToJsonFile(resultList);
log.info("历史爬虫打印结果===" + resultList);
return resultList;
}
public void writeToJsonFile(List<Map<String, Object>> resultList) {
try {
// 创建 ObjectMapper 实例
ObjectMapper objectMapper = new ObjectMapper();
// 设置 JSON 格式化(可选,更易读)
objectMapper.enable(SerializationFeature.INDENT_OUTPUT);
// 定义输出目录
String directoryPath = path+"/current_data"; // 项目根目录下的 output/json 文件夹
// 使用年月日作为文件名格式result_yyyyMMdd.json
String fileName = "result_" + date + ".json";
String filePath = directoryPath + "/" + fileName;
// 创建目录(如果不存在)
File directory = new File(directoryPath);
if (!directory.exists()) {
directory.mkdirs(); // 创建多级目录
}
// 创建文件对象
File outputFile = new File(filePath);
// 如果文件已存在,读取现有数据并对比
List<Map<String, Object>> existingData = new ArrayList<>();
Set<String> existingIds = new HashSet<>();
if (outputFile.exists()) {
try {
existingData = objectMapper.readValue(outputFile,
objectMapper.getTypeFactory().constructCollectionType(List.class, Map.class));
for (Map<String, Object> item : existingData) {
if (item.containsKey("id")) {
existingIds.add(item.get("id").toString());
}
}
log.info("已读取现有数据,共 " + existingData.size() + " 条记录");
} catch (IOException e) {
log.warn("读取现有文件失败,将覆盖写入: " + e.getMessage());
existingIds.clear();
}
}
// 筛选出新增的数据id不在existingIds中的记录
List<Map<String, Object>> newData = new ArrayList<>();
for (Map<String, Object> item : resultList) {
if (item.containsKey("id")) {
String id = item.get("id").toString();
if (!existingIds.contains(id)) {
newData.add(item);
}
}
}
// 合并现有数据和新数据
List<Map<String, Object>> finalData = new ArrayList<>();
if (!existingData.isEmpty()) {
finalData.addAll(existingData);
}
finalData.addAll(newData);
// 将合并后的数据写入 JSON 文件
objectMapper.writeValue(outputFile, finalData);
log.info("数据已成功写入文件: " + outputFile.getAbsolutePath() +
" (现有: " + existingData.size() + " 条, 新增: " + newData.size() + " 条, 总计: " + finalData.size() + " 条)");
} catch (IOException e) {
e.printStackTrace();
log.error("写入 JSON 文件失败: " + e.getMessage(), e);
throw new RuntimeException("写入 JSON 文件失败: " + e.getMessage(), e);
}
}
public static void main(String[] args) {
String url = "https://4701268539-esh.qdk63ayw8g.com/member/dresult?lottery=SGFT&date=2026-02-06";
// 创建爬虫
Spider.create(new LotteryHistoryCrawler("","",""))
.addUrl(url) // 添加起始URL
.thread(1) // 线程数
.run(); // 开始爬取
}
// 自定义headers
/*private Map<String, String> getHeaders() {
Map<String, String> headers = new HashMap<>();
headers.put("cookie", "token=a1b219fe7e39374d6af532c56fdc911b76ae8f83");
return headers;
}*/
}