281 lines
9.6 KiB
Java
281 lines
9.6 KiB
Java
package com.tem.bocai.util;
|
||
|
||
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.SerializationFeature;
import java.io.File;
import java.io.IOException;
import java.util.*;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Selectable;
|
||
|
||
//开奖的历史结果
|
||
public class LotteryWebMagicCrawler implements PageProcessor {
|
||
|
||
private final String token;
|
||
// 站点配置
|
||
private Site site;
|
||
// final LoginService loginService;
|
||
|
||
|
||
public LotteryWebMagicCrawler(String token) {
|
||
this.token = token;
|
||
initSite();
|
||
}
|
||
|
||
/**
|
||
* 初始化Site配置
|
||
*/
|
||
private void initSite() {
|
||
site = Site.me()
|
||
.setRetryTimes(3)
|
||
.setSleepTime(1000)
|
||
.setTimeOut(10000)
|
||
.setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36");
|
||
|
||
// 设置cookie
|
||
if (token != null && !token.isEmpty()) {
|
||
site.addHeader("cookie", "token=" + token);
|
||
}
|
||
}
|
||
|
||
|
||
@Override
|
||
public void process(Page page) {
|
||
// 获取页面HTML
|
||
Html html = page.getHtml();
|
||
|
||
// 打印页面基本信息
|
||
System.out.println("页面URL: " + page.getUrl());
|
||
System.out.println("页面标题: " + html.xpath("//title/text()").get());
|
||
|
||
// 示例:提取所有表格数据
|
||
Selectable tables = html.xpath("//table");
|
||
System.out.println("找到 " + tables.nodes().size() + " 个表格");
|
||
|
||
// 提取表格数据(根据实际页面结构调整选择器)
|
||
extractTableData(html);
|
||
|
||
// 示例:提取所有链接
|
||
Selectable links = html.links();
|
||
System.out.println("页面包含 " + links.all().size() + " 个链接");
|
||
|
||
// 如果需要继续爬取其他页面
|
||
// page.addTargetRequests(links.all());
|
||
|
||
// 将数据存入结果
|
||
/* page.putField("html========", html.toString());
|
||
page.putField("title", html.xpath("//title/text()").get());*/
|
||
parseLotteryHtml(html.toString());
|
||
|
||
}
|
||
|
||
private void extractTableData(Html html) {
|
||
// 根据实际页面结构编写数据提取逻辑
|
||
// 示例:提取所有tr元素
|
||
Selectable rows = html.xpath("//tr");
|
||
for (Selectable row : rows.nodes()) {
|
||
// 提取每行的td内容
|
||
String rowText = row.xpath("//td/text()").all().toString();
|
||
if (!rowText.isEmpty()) {
|
||
System.out.println("行数据: " + rowText);
|
||
}
|
||
}
|
||
}
|
||
|
||
@Override
|
||
public Site getSite() {
|
||
|
||
return site;
|
||
}
|
||
|
||
|
||
/**
|
||
* 解析彩票HTML数据,转换成指定的List<Map<String, Object>>格式
|
||
*
|
||
* @param htmlContent 爬取到的HTML文本内容
|
||
* @return 解析后的结构化数据列表
|
||
*/
|
||
public static List<Map<String, Object>> parseLotteryHtml(String htmlContent) {
|
||
List<Map<String, Object>> resultList = new ArrayList<>();
|
||
|
||
// 初始化Jsoup解析器
|
||
Document doc = Jsoup.parse(htmlContent);
|
||
|
||
// 定位到数据所在的表格行(drawTable下的table > tbody > tr)
|
||
Element targetTable = doc.selectFirst("#drawTable");
|
||
if (targetTable == null) {
|
||
return resultList;
|
||
}
|
||
|
||
Elements trList = targetTable.select("table > tbody > tr");
|
||
|
||
// 遍历每一行数据
|
||
for (Element tr : trList) {
|
||
Map<String, Object> rowData = new HashMap<>();
|
||
|
||
// 1. 提取期数(id)
|
||
Element periodTd = tr.selectFirst("td.period");
|
||
rowData.put("id", periodTd != null ? periodTd.text().trim() : "");
|
||
|
||
// 2. 提取开奖时间(time)
|
||
Element timeTd = tr.selectFirst("td.drawTime");
|
||
rowData.put("time", timeTd != null ? timeTd.text().trim() : "");
|
||
|
||
// 3. 提取开出号码(result)- 10个ballname的数字
|
||
Elements ballTds = tr.select("td.ballname");
|
||
List<Integer> resultNumbers = new ArrayList<>();
|
||
int count = 0;
|
||
for (Element td : ballTds) {
|
||
if (count >= 10) break;
|
||
String text = td.text().trim();
|
||
if (text.matches("\\d+")) {
|
||
resultNumbers.add(Integer.parseInt(text));
|
||
count++;
|
||
}
|
||
}
|
||
rowData.put("result", resultNumbers);
|
||
|
||
// 4. 提取winner(other1)
|
||
Element winnerTd = tr.selectFirst("td.other1");
|
||
if (winnerTd != null) {
|
||
String winnerText = winnerTd.text().trim();
|
||
if (winnerText.matches("\\d+")) {
|
||
rowData.put("winner", Integer.parseInt(winnerText));
|
||
} else {
|
||
rowData.put("winner", "");
|
||
}
|
||
} else {
|
||
rowData.put("winner", "");
|
||
}
|
||
|
||
// 5. 提取GD1(冠亚小/大)、GD2(冠亚单/双)
|
||
Elements otherTds = tr.select("td.other");
|
||
String gd1 = "";
|
||
String gd2 = "";
|
||
for (Element td : otherTds) {
|
||
String className = td.className();
|
||
if (className.contains("GDX")) {
|
||
gd1 = td.text().trim();
|
||
} else if (className.contains("GDS")) {
|
||
gd2 = td.text().trim();
|
||
}
|
||
}
|
||
rowData.put("GD1", gd1);
|
||
rowData.put("GD2", gd2);
|
||
|
||
// 6. 提取sum1(dldhl_sum)、sum2(dldhh_sum)
|
||
Element sum1Td = tr.selectFirst("td.dldhl_sum");
|
||
if (sum1Td != null) {
|
||
String sum1Text = sum1Td.text().trim();
|
||
if (sum1Text.matches("\\d+")) {
|
||
rowData.put("sum1", Integer.parseInt(sum1Text));
|
||
} else {
|
||
rowData.put("sum1", "");
|
||
}
|
||
} else {
|
||
rowData.put("sum1", "");
|
||
}
|
||
|
||
Element sum2Td = tr.selectFirst("td.dldhh_sum");
|
||
if (sum2Td != null) {
|
||
String sum2Text = sum2Td.text().trim();
|
||
if (sum2Text.matches("\\d+")) {
|
||
rowData.put("sum2", Integer.parseInt(sum2Text));
|
||
} else {
|
||
rowData.put("sum2", "");
|
||
}
|
||
} else {
|
||
rowData.put("sum2", "");
|
||
}
|
||
|
||
// 7. 提取GLH_result(龙虎结果,5个GLH开头的td)
|
||
List<String> glhResults = new ArrayList<>();
|
||
int glhCount = 0;
|
||
for (Element td : otherTds) {
|
||
if (glhCount >= 5) break;
|
||
String className = td.className();
|
||
if (className.contains("GLH_")) {
|
||
glhResults.add(td.text().trim());
|
||
glhCount++;
|
||
}
|
||
}
|
||
rowData.put("GLH_result", glhResults);
|
||
|
||
// 将单行数据加入结果列表(只保留有期数的有效行)
|
||
if (!rowData.get("id").toString().isEmpty()) {
|
||
resultList.add(rowData);
|
||
}
|
||
}
|
||
// 将数据写入SQLite数据库
|
||
SQLiteUtil.writeToSQLite(resultList);
|
||
// 将数据写入JSON文件(保留原有功能)
|
||
writeToJsonFile(resultList);
|
||
System.out.println("打印结果===" + resultList);
|
||
return resultList;
|
||
}
|
||
|
||
public static void writeToJsonFile(List<Map<String, Object>> resultList) {
|
||
try {
|
||
// 创建 ObjectMapper 实例
|
||
ObjectMapper objectMapper = new ObjectMapper();
|
||
|
||
// 设置 JSON 格式化(可选,更易读)
|
||
objectMapper.enable(SerializationFeature.INDENT_OUTPUT);
|
||
|
||
// 定义输出目录和文件名
|
||
String directoryPath = "output/json"; // 项目根目录下的 output/json 文件夹
|
||
String fileName = "result_" + System.currentTimeMillis() + ".json";
|
||
String filePath = directoryPath + "/" + fileName;
|
||
|
||
// 创建目录(如果不存在)
|
||
File directory = new File(directoryPath);
|
||
if (!directory.exists()) {
|
||
directory.mkdirs(); // 创建多级目录
|
||
}
|
||
// 创建文件对象
|
||
File outputFile = new File(filePath);
|
||
// 将 List 写入 JSON 文件
|
||
objectMapper.writeValue(outputFile, resultList);
|
||
System.out.println("数据已成功写入文件: " + outputFile.getAbsolutePath());
|
||
|
||
} catch (IOException e) {
|
||
e.printStackTrace();
|
||
throw new RuntimeException("写入 JSON 文件失败: " + e.getMessage(), e);
|
||
}
|
||
}
|
||
|
||
|
||
public static void main(String[] args) {
|
||
String url = "https://4701268539-esh.qdk63ayw8g.com/member/dresult?lottery=SGFT&date=2026-01-18";
|
||
|
||
// 创建爬虫
|
||
Spider.create(new LotteryWebMagicCrawler(""))
|
||
.addUrl(url) // 添加起始URL
|
||
.thread(1) // 线程数
|
||
.run(); // 开始爬取
|
||
}
|
||
|
||
|
||
// 自定义headers
|
||
/*private Map<String, String> getHeaders() {
|
||
Map<String, String> headers = new HashMap<>();
|
||
headers.put("cookie", "token=a1b219fe7e39374d6af532c56fdc911b76ae8f83");
|
||
|
||
return headers;
|
||
}*/
|
||
} |