Files
bocai/src/main/java/com/tem/bocai/util/LotteryWebMagicCrawler.java
2026-01-21 16:28:53 +08:00

281 lines
9.6 KiB
Java
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package com.tem.bocai.util;
import com.fasterxml.jackson.databind.SerializationFeature;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Selectable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import com.fasterxml.jackson.databind.ObjectMapper;
import java.io.File;
import java.io.IOException;
import java.util.*;
/**
 * WebMagic {@link PageProcessor} for the historical lottery draw-result page.
 *
 * <p>For every downloaded page it prints some diagnostic information, then hands the raw HTML to
 * {@link #parseLotteryHtml(String)}, which converts the rows of the {@code #drawTable} element
 * into a list of maps, stores that list through {@code SQLiteUtil.writeToSQLite} and dumps it to
 * a timestamped JSON file under {@code output/json}.
 */
public class LotteryWebMagicCrawler implements PageProcessor {

    /** Start URL used by {@link #main(String[])} when no URL argument is given. */
    private static final String DEFAULT_URL =
            "https://4701268539-esh.qdk63ayw8g.com/member/dresult?lottery=SGFT&date=2026-01-18";

    /** Directory (relative to the working directory) that receives the JSON dumps. */
    private static final String JSON_OUTPUT_DIR = "output/json";

    /** Upper bound on the number of drawn ball numbers kept per row. */
    private static final int MAX_BALLS = 10;

    /** Upper bound on the number of dragon/tiger ({@code GLH_*}) cells kept per row. */
    private static final int MAX_GLH_RESULTS = 5;

    /** Shared pretty-printing mapper; configured once here and never reconfigured afterwards. */
    private static final ObjectMapper JSON_MAPPER =
            new ObjectMapper().enable(SerializationFeature.INDENT_OUTPUT);

    /** Site configuration handed to WebMagic through {@link #getSite()}. */
    private final Site site;

    /**
     * Creates a crawler whose requests carry the given session token.
     *
     * @param token value sent as the {@code token} cookie; when {@code null} or empty no cookie
     *              header is added
     */
    public LotteryWebMagicCrawler(String token) {
        this.site = buildSite(token);
    }

    /**
     * Builds the site configuration: 3 retries, 1000 ms sleep time, 10000 ms timeout, a desktop
     * Chrome user agent and, when a token is supplied, a {@code cookie} header carrying it.
     */
    private static Site buildSite(String token) {
        Site configured = Site.me()
                .setRetryTimes(3)
                .setSleepTime(1000)
                .setTimeOut(10000)
                .setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36");
        if (token != null && !token.isEmpty()) {
            configured.addHeader("cookie", "token=" + token);
        }
        return configured;
    }

    /**
     * Prints diagnostics for the downloaded page (URL, title, table count, table rows, link
     * count) and then parses and persists the draw results it contains.
     *
     * @param page the page delivered by WebMagic
     */
    @Override
    public void process(Page page) {
        Html html = page.getHtml();

        System.out.println("页面URL: " + page.getUrl());
        System.out.println("页面标题: " + html.xpath("//title/text()").get());

        Selectable tables = html.xpath("//table");
        System.out.println("找到 " + tables.nodes().size() + " 个表格");

        extractTableData(html);

        Selectable links = html.links();
        System.out.println("页面包含 " + links.all().size() + " 个链接");

        // The returned list is not needed here; parseLotteryHtml persists it itself.
        parseLotteryHtml(html.toString());
    }

    /**
     * Prints the text of the {@code td} cells of every table row that has at least one.
     *
     * @param html the page content to scan
     */
    private void extractTableData(Html html) {
        for (Selectable row : html.xpath("//tr").nodes()) {
            List<String> cellTexts = row.xpath("//td/text()").all();
            // Test the list, not its string form: an empty list prints as "[]", which is never
            // an empty string, so checking the string would let cell-less rows through.
            if (cellTexts != null && !cellTexts.isEmpty()) {
                System.out.println("行数据: " + cellTexts);
            }
        }
    }

    /** Returns the site configuration built in the constructor. */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Parses the lottery result HTML into a list of row maps and persists that list.
     *
     * <p>Only rows below the {@code #drawTable} element that have a non-empty period id are kept.
     * Each map holds the keys {@code id}, {@code time}, {@code result}, {@code winner},
     * {@code GD1}, {@code GD2}, {@code sum1}, {@code sum2} and {@code GLH_result}. Numeric fields
     * hold an {@code Integer}, or the empty string when the cell is missing or not a number.
     *
     * <p>When the content is {@code null}, empty, or has no {@code #drawTable} element, an empty
     * list is returned and nothing is written. Otherwise the list (even if empty) is passed to
     * {@code SQLiteUtil.writeToSQLite} and {@link #writeToJsonFile(List)} and printed.
     *
     * @param htmlContent HTML text of the crawled page
     * @return the parsed rows; never {@code null}
     */
    public static List<Map<String, Object>> parseLotteryHtml(String htmlContent) {
        List<Map<String, Object>> resultList = new ArrayList<>();
        if (htmlContent == null || htmlContent.isEmpty()) {
            return resultList;
        }

        Document doc = Jsoup.parse(htmlContent);
        Element targetTable = doc.selectFirst("#drawTable");
        if (targetTable == null) {
            return resultList;
        }

        for (Element tr : targetTable.select("table > tbody > tr")) {
            Map<String, Object> rowData = parseRow(tr);
            if (rowData != null) {
                resultList.add(rowData);
            }
        }

        SQLiteUtil.writeToSQLite(resultList);
        writeToJsonFile(resultList);
        System.out.println("打印结果===" + resultList);
        return resultList;
    }

    /** Converts one table row into its map form, or returns {@code null} when it has no period id. */
    private static Map<String, Object> parseRow(Element tr) {
        String periodId = cellText(tr, "td.period");
        if (periodId.isEmpty()) {
            return null;
        }

        Map<String, Object> rowData = new HashMap<>();
        rowData.put("id", periodId);
        rowData.put("time", cellText(tr, "td.drawTime"));
        rowData.put("result", parseBallNumbers(tr));
        rowData.put("winner", intCellOrBlank(tr, "td.other1"));

        // One pass over the "other" cells collects GD1 (champion+runner-up small/big), GD2
        // (champion+runner-up odd/even) and the dragon/tiger results.
        String gd1 = "";
        String gd2 = "";
        List<String> glhResults = new ArrayList<>();
        for (Element td : tr.select("td.other")) {
            String className = td.className();
            if (className.contains("GDX")) {
                gd1 = td.text().trim();
            } else if (className.contains("GDS")) {
                gd2 = td.text().trim();
            }
            if (glhResults.size() < MAX_GLH_RESULTS && className.contains("GLH_")) {
                glhResults.add(td.text().trim());
            }
        }
        rowData.put("GD1", gd1);
        rowData.put("GD2", gd2);

        rowData.put("sum1", intCellOrBlank(tr, "td.dldhl_sum"));
        rowData.put("sum2", intCellOrBlank(tr, "td.dldhh_sum"));
        rowData.put("GLH_result", glhResults);
        return rowData;
    }

    /** Collects up to {@link #MAX_BALLS} numeric values from the row's {@code td.ballname} cells. */
    private static List<Integer> parseBallNumbers(Element tr) {
        List<Integer> numbers = new ArrayList<>();
        for (Element td : tr.select("td.ballname")) {
            if (numbers.size() >= MAX_BALLS) {
                break;
            }
            Integer value = parseIntOrNull(td.text().trim());
            if (value != null) {
                numbers.add(value);
            }
        }
        return numbers;
    }

    /** Returns the trimmed text of the first element matching the selector, or {@code ""} if none. */
    private static String cellText(Element row, String selector) {
        Element cell = row.selectFirst(selector);
        return cell != null ? cell.text().trim() : "";
    }

    /** Returns the selected cell as an {@code Integer}, or {@code ""} when absent or not numeric. */
    private static Object intCellOrBlank(Element row, String selector) {
        Integer value = parseIntOrNull(cellText(row, selector));
        if (value == null) {
            return "";
        }
        return value;
    }

    /** Parses an all-digit string; returns {@code null} for anything else, including overflow. */
    private static Integer parseIntOrNull(String text) {
        if (!text.matches("\\d+")) {
            return null;
        }
        try {
            return Integer.valueOf(text);
        } catch (NumberFormatException ignored) {
            // All digits but larger than Integer.MAX_VALUE: treat like any other non-numeric cell
            // instead of aborting the whole page.
            return null;
        }
    }

    /**
     * Writes the given rows as indented JSON to {@code output/json/result_<millis>.json},
     * creating the directory when needed, and prints the absolute path of the written file.
     *
     * @param resultList the rows to serialize
     * @throws RuntimeException if the directory cannot be created or the file cannot be written;
     *                          the underlying {@link IOException} is attached as the cause
     */
    public static void writeToJsonFile(List<Map<String, Object>> resultList) {
        File directory = new File(JSON_OUTPUT_DIR);
        File outputFile = new File(directory, "result_" + System.currentTimeMillis() + ".json");
        try {
            // mkdirs() also returns false when the directory already exists, so only fail when
            // it is still missing afterwards.
            if (!directory.mkdirs() && !directory.isDirectory()) {
                throw new IOException("cannot create directory " + directory.getAbsolutePath());
            }
            JSON_MAPPER.writeValue(outputFile, resultList);
            System.out.println("数据已成功写入文件: " + outputFile.getAbsolutePath());
        } catch (IOException e) {
            // The cause is attached, so the caller decides how to report it; no extra stack trace.
            throw new RuntimeException("写入 JSON 文件失败: " + e.getMessage(), e);
        }
    }

    /**
     * Runs a single-threaded crawl.
     *
     * @param args optional: {@code args[0]} overrides the start URL, {@code args[1]} supplies the
     *             session token; without arguments the built-in URL and an empty token are used
     */
    public static void main(String[] args) {
        String url = args.length > 0 ? args[0] : DEFAULT_URL;
        String token = args.length > 1 ? args[1] : "";

        Spider.create(new LotteryWebMagicCrawler(token))
                .addUrl(url)
                .thread(1)
                .run();
    }
}