From 67335e08aa70d2ca7b898ad34fa10ad16262a86b Mon Sep 17 00:00:00 2001 From: xuelijun <977662702@qq.com> Date: Wed, 25 Feb 2026 09:53:43 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=9D=E5=A7=8B=E5=8C=96=E7=88=AC=E5=8F=96?= =?UTF-8?q?=E6=9C=80=E8=BF=917=E5=A4=A9=E7=9A=84=E5=BC=80=E5=A5=96?= =?UTF-8?q?=E7=BB=93=E6=9E=9C=E4=BB=BB=E5=8A=A1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../java/com/tem/bocai/BocaiApplication.java | 6 +- .../tem/bocai/schedules/CrawlerSchedule.java | 131 +++++++ .../java/com/tem/bocai/util/DateUtils.java | 16 + .../tem/bocai/util/LotteryHistoryCrawler.java | 332 ++++++++++++++++++ 4 files changed, 482 insertions(+), 3 deletions(-) create mode 100644 src/main/java/com/tem/bocai/util/LotteryHistoryCrawler.java diff --git a/src/main/java/com/tem/bocai/BocaiApplication.java b/src/main/java/com/tem/bocai/BocaiApplication.java index d65e6bc..c844efc 100644 --- a/src/main/java/com/tem/bocai/BocaiApplication.java +++ b/src/main/java/com/tem/bocai/BocaiApplication.java @@ -18,9 +18,9 @@ public class BocaiApplication { // // 依次执行三个任务 // // 1. 执行CrawlerSchedule方法 -// System.out.println("\n=== 开始执行CrawlerSchedule任务 ==="); -// CrawlerSchedule crawlerSchedule = context.getBean(CrawlerSchedule.class); -// crawlerSchedule.executePksHistory(); + System.out.println("\n=== 开始执行初始化爬取最近7天的开奖结果任务 ==="); + CrawlerSchedule crawlerSchedule = context.getBean(CrawlerSchedule.class); + crawlerSchedule.executeLotteryDrawHistory(); // // 3. 执行ExBetScriptSchedule方法 // System.out.println("\n=== 开始执行ExBetScriptSchedule任务 ==="); diff --git a/src/main/java/com/tem/bocai/schedules/CrawlerSchedule.java b/src/main/java/com/tem/bocai/schedules/CrawlerSchedule.java index 317a418..e96dd38 100644 --- a/src/main/java/com/tem/bocai/schedules/CrawlerSchedule.java +++ b/src/main/java/com/tem/bocai/schedules/CrawlerSchedule.java @@ -337,4 +337,135 @@ public class CrawlerSchedule { } } + + //开始爬取最近7天的开奖结果 + public void executeLotteryDrawHistory() { + log.info("开始爬取最近7天的开奖结果"); + + LoginInfoResult firstByOrderByCreateTimeDesc = loginInfoRepository.findFirstByOrderByCreateTimeDesc() + .orElse(null); + if (firstByOrderByCreateTimeDesc == null) { + log.error("未找到登录信息"); + return; + } + if(firstByOrderByCreateTimeDesc.getOnOff() == ONOFF){ + log.info("开关已关闭,停止爬取"); + return; + } + + String token = tokenCacheService.getToken(); + if (token == null || token.isEmpty()) { + log.error("token为空"); + return; + } + + // 获取过去7天的日期列表 + List dateList = DateUtils.getLast7Days(); + + for (String date : dateList) { + log.info("\n=== 开始爬取日期: {} 的数据 ===", date); + + // 检查该日期的数据文件是否已存在且有数据 + if (isDateDataExists(date)) { + log.info("日期 {} 的数据已存在,跳过爬取", date); + continue; + } + + // 对每个日期进行重试 + boolean success = crawlDataForDate(date, token); + + if (success) { + log.info("日期 {} 数据爬取成功", date); + } else { + log.error("日期 {} 数据爬取失败,已达到最大重试次数", date); + } + + // 每次请求后稍作等待,避免请求过于频繁 + try { + Thread.sleep(1000); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + } + } + + log.info("最近7天数据爬取完成"); + } + + /** + * 爬取指定日期的数据 + */ + private boolean crawlDataForDate(String date, String token) { + int retryCount = 0; + boolean success = false; + String currentToken = token; + + LoginInfoResult loginInfo = loginInfoRepository.findFirstByOrderByCreateTimeDesc() + .orElse(null); + if (loginInfo == null) { + return false; + } + + while (!success && retryCount < MAX_CRA) { + log.info("\n=== 第 " + (retryCount + 1) + " 次尝试获取 " + date + " 的开奖结果 ==="); + + if (currentToken == null || currentToken.isEmpty()) { + log.info("token为空,从数据库重新获取"); + currentToken = tokenCacheService.getTokenSqlite(); + if (currentToken == null) { + log.error("无法获取有效token"); + retryCount++; + continue; + } + } + + log.info("使用token: " + (currentToken.length() > 20 ? currentToken.substring(0, 20) + "..." : currentToken)); + + // 创建爬虫实例,传入token + LotteryHistoryCrawler crawler = new LotteryHistoryCrawler(currentToken, pypath,date); + + // 构建URL + String url = loginInfo.getLoginUrl() + "/member/dresult?lottery=SGFT&date=" + date; + + Spider.create(crawler) + .addUrl(url) + .thread(1) + .run(); + + // 检查是否成功解析数据 + success = LotteryHistoryCrawler.isLastParseSuccess(); + + if (!success) { + log.info("本次尝试未解析到数据"); + // 重新获取token(下次重试用) + currentToken = tokenCacheService.getTokenSqlite(); + retryCount++; + + // 等待一下再重试 + if (retryCount < MAX_CRA) { + try { + Thread.sleep(2000 * retryCount); // 等待时间递增 + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + } + } + } else { + log.info("成功解析到数据"); + } + } + + return success; + } + /** + * 检查指定日期的数据文件是否存在且包含数据 + */ + private boolean isDateDataExists(String date) { + try { + List data = lotteryResultRepository.findByTimeContaining(date); + return data != null && !data.isEmpty(); + } catch (Exception e) { + log.warn("检查文件失败: " + e.getMessage()); + return false; + } + } + } diff --git a/src/main/java/com/tem/bocai/util/DateUtils.java b/src/main/java/com/tem/bocai/util/DateUtils.java index efcacf3..01605f5 100644 --- a/src/main/java/com/tem/bocai/util/DateUtils.java +++ b/src/main/java/com/tem/bocai/util/DateUtils.java @@ -72,6 +72,22 @@ public class DateUtils extends org.apache.commons.lang3.time.DateUtils return targetDate.format(DATE_FORMATTER); }*/ + // 近7天日期的方法 + public static List getLast7Days() { + List dateList = new ArrayList<>(); + SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); + Calendar calendar = Calendar.getInstance(); + + // 从今天开始,往前推7天 + for (int i = 0; i < 7; i++) { + dateList.add(sdf.format(calendar.getTime())); + calendar.add(Calendar.DAY_OF_YEAR, -1); + } + + return dateList; + } + + public static void main(String[] args) { System.out.println("====="+getTodayDate()); /* Date now = new Date(); // 当前时间 diff --git a/src/main/java/com/tem/bocai/util/LotteryHistoryCrawler.java b/src/main/java/com/tem/bocai/util/LotteryHistoryCrawler.java new file mode 100644 index 0000000..f132db0 --- /dev/null +++ b/src/main/java/com/tem/bocai/util/LotteryHistoryCrawler.java @@ -0,0 +1,332 @@ +package com.tem.bocai.util; + +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.SerializationFeature; +import lombok.extern.slf4j.Slf4j; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.processor.PageProcessor; +import us.codecraft.webmagic.selector.Html; +import us.codecraft.webmagic.selector.Selectable; + +import java.io.File; +import java.io.IOException; +import java.util.*; + +//开奖的历史结果 +@Slf4j +public class LotteryHistoryCrawler implements PageProcessor { + + private final String token; + // 站点配置 + private Site site; + // final LoginService loginService; + // 添加一个字段标记是否成功解析数据 + private static volatile boolean lastParseSuccess = true; + + private String path; + private String date; + + public LotteryHistoryCrawler(String token, String path,String date) { + this.token = token; + this.path =path; + this.date =date; + initSite(); + } + + /** + * 初始化Site配置 + */ + private void initSite() { + site = Site.me() + .setRetryTimes(3) + .setSleepTime(1000) + .setTimeOut(10000) + .setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36"); + + // 设置cookie + if (token != null && !token.isEmpty()) { + site.addHeader("cookie", "token=" + token); + } + } + + + @Override + public void process(Page page) { + // 获取页面HTML + Html html = page.getHtml(); + + // 打印页面基本信息 + log.info("页面URL: " + page.getUrl()); + log.info("页面标题: " + html.xpath("//title/text()").get()); + // 示例:提取所有表格数据 + Selectable tables = html.xpath("//table"); + log.info("找到 " + tables.nodes().size() + " 个表格"); + if(tables.nodes().isEmpty()){ + lastParseSuccess = false; + }else { + lastParseSuccess = true; + } + // 提取表格数据(根据实际页面结构调整选择器) + extractTableData(html); + + // 示例:提取所有链接 + Selectable links = html.links(); + System.out.println("页面包含 " + links.all().size() + " 个链接"); + // 如果需要继续爬取其他页面 + // page.addTargetRequests(links.all()); + + // 将数据存入结果 + /* page.putField("html========", html.toString()); + page.putField("title", html.xpath("//title/text()").get());*/ + parseLotteryHtml(html.toString()); + + } + + private void extractTableData(Html html) { + // 根据实际页面结构编写数据提取逻辑 + // 示例:提取所有tr元素 + Selectable rows = html.xpath("//tr"); + for (Selectable row : rows.nodes()) { + // 提取每行的td内容 + String rowText = row.xpath("//td/text()").all().toString(); + if (!rowText.isEmpty()) { + System.out.println("行数据: " + rowText); + } + } + } + + @Override + public Site getSite() { + + return site; + } + + /** + * 添加一个方法获取解析状态 + */ + public static boolean isLastParseSuccess() { + return lastParseSuccess; + } + /** + * 解析彩票HTML数据,转换成指定的List>格式 + * + * @param htmlContent 爬取到的HTML文本内容 + * @return 解析后的结构化数据列表 + */ + public List> parseLotteryHtml(String htmlContent) { + List> resultList = new ArrayList<>(); + + // 初始化Jsoup解析器 + Document doc = Jsoup.parse(htmlContent); + + // 定位到数据所在的表格行(drawTable下的table > tbody > tr) + Element targetTable = doc.selectFirst("#drawTable"); + if (targetTable == null) { + return resultList; + } + + Elements trList = targetTable.select("table > tbody > tr"); + + // 遍历每一行数据 + for (Element tr : trList) { + Map rowData = new HashMap<>(); + + // 1. 提取期数(id) + Element periodTd = tr.selectFirst("td.period"); + rowData.put("id", periodTd != null ? periodTd.text().trim() : ""); + + // 2. 提取开奖时间(time) + Element timeTd = tr.selectFirst("td.drawTime"); + rowData.put("time", timeTd != null ? timeTd.text().trim() : ""); + + // 3. 提取开出号码(result)- 10个ballname的数字 + Elements ballTds = tr.select("td.ballname"); + List resultNumbers = new ArrayList<>(); + int count = 0; + for (Element td : ballTds) { + if (count >= 10) break; + String text = td.text().trim(); + if (text.matches("\\d+")) { + resultNumbers.add(Integer.parseInt(text)); + count++; + } + } + rowData.put("result", resultNumbers); + + // 4. 提取winner(other1) + Element winnerTd = tr.selectFirst("td.other1"); + if (winnerTd != null) { + String winnerText = winnerTd.text().trim(); + if (winnerText.matches("\\d+")) { + rowData.put("winner", Integer.parseInt(winnerText)); + } else { + rowData.put("winner", ""); + } + } else { + rowData.put("winner", ""); + } + + // 5. 提取GD1(冠亚小/大)、GD2(冠亚单/双) + Elements otherTds = tr.select("td.other"); + String gd1 = ""; + String gd2 = ""; + for (Element td : otherTds) { + String className = td.className(); + if (className.contains("GDX")) { + gd1 = td.text().trim(); + } else if (className.contains("GDS")) { + gd2 = td.text().trim(); + } + } + rowData.put("GD1", gd1); + rowData.put("GD2", gd2); + + // 6. 提取sum1(dldhl_sum)、sum2(dldhh_sum) + Element sum1Td = tr.selectFirst("td.dldhl_sum"); + if (sum1Td != null) { + String sum1Text = sum1Td.text().trim(); + if (sum1Text.matches("\\d+")) { + rowData.put("sum1", Integer.parseInt(sum1Text)); + } else { + rowData.put("sum1", ""); + } + } else { + rowData.put("sum1", ""); + } + + Element sum2Td = tr.selectFirst("td.dldhh_sum"); + if (sum2Td != null) { + String sum2Text = sum2Td.text().trim(); + if (sum2Text.matches("\\d+")) { + rowData.put("sum2", Integer.parseInt(sum2Text)); + } else { + rowData.put("sum2", ""); + } + } else { + rowData.put("sum2", ""); + } + + // 7. 提取GLH_result(龙虎结果,5个GLH开头的td) + List glhResults = new ArrayList<>(); + int glhCount = 0; + for (Element td : otherTds) { + if (glhCount >= 5) break; + String className = td.className(); + if (className.contains("GLH_")) { + glhResults.add(td.text().trim()); + glhCount++; + } + } + rowData.put("GLH_result", glhResults); + + // 将单行数据加入结果列表(只保留有期数的有效行) + if (!rowData.get("id").toString().isEmpty()) { + resultList.add(rowData); + } + } + // 将数据写入SQLite数据库 + SQLiteUtil.writeToSQLite(resultList); + // 将数据写入JSON文件(保留原有功能) + writeToJsonFile(resultList); + log.info("历史爬虫打印结果===" + resultList); + return resultList; + } + + public void writeToJsonFile(List> resultList) { + try { + // 创建 ObjectMapper 实例 + ObjectMapper objectMapper = new ObjectMapper(); + + // 设置 JSON 格式化(可选,更易读) + objectMapper.enable(SerializationFeature.INDENT_OUTPUT); + + // 定义输出目录 + String directoryPath = path+"/current_data"; // 项目根目录下的 output/json 文件夹 + + // 使用年月日作为文件名(格式:result_yyyyMMdd.json) + String fileName = "result_" + date + ".json"; + String filePath = directoryPath + "/" + fileName; + + // 创建目录(如果不存在) + File directory = new File(directoryPath); + if (!directory.exists()) { + directory.mkdirs(); // 创建多级目录 + } + + // 创建文件对象 + File outputFile = new File(filePath); + + // 如果文件已存在,读取现有数据并对比 + List> existingData = new ArrayList<>(); + Set existingIds = new HashSet<>(); + if (outputFile.exists()) { + try { + existingData = objectMapper.readValue(outputFile, + objectMapper.getTypeFactory().constructCollectionType(List.class, Map.class)); + for (Map item : existingData) { + if (item.containsKey("id")) { + existingIds.add(item.get("id").toString()); + } + } + log.info("已读取现有数据,共 " + existingData.size() + " 条记录"); + } catch (IOException e) { + log.warn("读取现有文件失败,将覆盖写入: " + e.getMessage()); + existingIds.clear(); + } + } + + // 筛选出新增的数据(id不在existingIds中的记录) + List> newData = new ArrayList<>(); + for (Map item : resultList) { + if (item.containsKey("id")) { + String id = item.get("id").toString(); + if (!existingIds.contains(id)) { + newData.add(item); + } + } + } + + // 合并现有数据和新数据 + List> finalData = new ArrayList<>(); + if (!existingData.isEmpty()) { + finalData.addAll(existingData); + } + finalData.addAll(newData); + + // 将合并后的数据写入 JSON 文件 + objectMapper.writeValue(outputFile, finalData); + log.info("数据已成功写入文件: " + outputFile.getAbsolutePath() + + " (现有: " + existingData.size() + " 条, 新增: " + newData.size() + " 条, 总计: " + finalData.size() + " 条)"); + } catch (IOException e) { + e.printStackTrace(); + log.error("写入 JSON 文件失败: " + e.getMessage(), e); + throw new RuntimeException("写入 JSON 文件失败: " + e.getMessage(), e); + } + } + + + public static void main(String[] args) { + String url = "https://4701268539-esh.qdk63ayw8g.com/member/dresult?lottery=SGFT&date=2026-02-06"; + + // 创建爬虫 + Spider.create(new LotteryHistoryCrawler("","","")) + .addUrl(url) // 添加起始URL + .thread(1) // 线程数 + .run(); // 开始爬取 + } + + + // 自定义headers + /*private Map getHeaders() { + Map headers = new HashMap<>(); + headers.put("cookie", "token=a1b219fe7e39374d6af532c56fdc911b76ae8f83"); + + return headers; + }*/ +} \ No newline at end of file