This commit is contained in:
xuelijun
2026-01-21 13:40:15 +08:00
parent e4a2905d73
commit 20d7538333
10 changed files with 1233 additions and 11 deletions

23
pom.xml
View File

@@ -55,6 +55,29 @@
<artifactId>tess4j</artifactId>
<version>5.18.0</version>
</dependency>
<!-- https://mvnrepository.com/artifact/us.codecraft/webmagic-core -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>1.0.3</version>
</dependency>
<!-- https://mvnrepository.com/artifact/us.codecraft/webmagic-extension -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>1.0.3</version>
</dependency>
<!-- 可选的用于JSON处理 -->
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>2.0.46</version>
</dependency>
<!-- <dependency>
<groupId>net.sourceforge.tess4j</groupId>
<artifactId>tess4j</artifactId>
<version>5.8.0</version>
</dependency>-->
</dependencies>
<build>
<plugins>

View File

@@ -0,0 +1,29 @@
package com.tem.bocai.controller;
import com.tem.bocai.service.LoginService;
import com.tem.bocai.util.ImageOcrService;
import net.sourceforge.tess4j.TesseractException;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.RestController;
import java.io.IOException;
/**
 * REST controller exposing the automatic login flow.
 */
@RestController
public class LoginCrawler {

    private final LoginService loginService;

    /**
     * Creates the controller; the service is supplied through constructor injection.
     *
     * @param loginService service that performs the login
     */
    public LoginCrawler(LoginService loginService) {
        this.loginService = loginService;
    }

    /**
     * Handles {@code GET /ocr/login}: passes all request values straight to
     * {@link LoginService#loginAutomatic} and returns its result with HTTP 200.
     *
     * @param username account name forwarded to the service
     * @param password password forwarded to the service
     * @param loginUrl login URL forwarded to the service
     * @param winNum   forwarded to the service unchanged
     * @param loseNum  forwarded to the service unchanged
     * @return a 200 response whose body is the string returned by the service
     * @throws IOException        declared by this endpoint
     * @throws TesseractException declared by this endpoint
     */
    @GetMapping("/ocr/login")
    public ResponseEntity<String> ocrLocalImage(
            String username,
            String password,
            String loginUrl,
            Integer winNum,
            Integer loseNum) throws IOException, TesseractException {
        return ResponseEntity.ok(
                loginService.loginAutomatic(username, password, loginUrl, winNum, loseNum));
    }
}

View File

@@ -29,9 +29,10 @@ public class TestController {
}
@GetMapping("/ocr/remote")
public ResponseEntity<String> ocrRemoteImage(String imageUrl) throws IOException, TesseractException {
String result = imageOcrService.ocrRemoteImage(imageUrl);
public ResponseEntity<String> ocrRemoteImage(String imageUrl) throws IOException, TesseractException, InterruptedException {
imageUrl = "https://4701268539-esh.qdk63ayw8g.com/code";
String result = imageOcrService.ocrRemoteImage();
System.out.println("++++"+result);
return ResponseEntity.ok(result);
}
}

View File

@@ -0,0 +1,10 @@
package com.tem.bocai.service;
/**
 * Login operations offered to the rest of the application.
 */
public interface LoginService {

    /**
     * Runs the automatic login flow.
     *
     * @param username account name
     * @param password account password
     * @param loginUrl login URL
     * @param winNum   numeric setting supplied by the caller
     * @param loseNum  numeric setting supplied by the caller
     * @return the outcome of the flow as a string
     */
    String loginAutomatic(String username,
                          String password,
                          String loginUrl,
                          Integer winNum,
                          Integer loseNum);

    /**
     * Obtains a token.
     *
     * @param username account name
     * @param password account password
     * @param loginUrl login URL
     * @return the token
     */
    String getToken(String username,
                    String password,
                    String loginUrl);
}

View File

@@ -0,0 +1,410 @@
package com.tem.bocai.service.impl;
import com.tem.bocai.service.LoginService;
import com.tem.bocai.util.LotteryDataPipeline;
import com.tem.bocai.util.LotteryWebMagicCrawler;
import org.springframework.stereotype.Service;
import net.sourceforge.tess4j.Tesseract;
import net.sourceforge.tess4j.TesseractException;
import org.apache.http.Header;
import org.apache.http.NameValuePair;
import org.apache.http.client.CookieStore;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.*;
import org.apache.http.impl.client.BasicCookieStore;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
import org.springframework.beans.factory.annotation.Autowired;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.*;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.cookie.Cookie;
import us.codecraft.webmagic.Spider;
@Service
public class LoginServiceImpl implements LoginService {
private static final String BASE_URL = "https://4701268539-esh.qdk63ayw8g.com";
private static final int MAX_RETRY = 5;
@Autowired
private Tesseract tesseract;
@Override
public String loginAutomatic(String username, String password, String loginUrl, Integer winNum, Integer loseNum) {
String token = "";
for (int attempt = 1; attempt <= MAX_RETRY; attempt++) {
System.out.println("\n=== 第 " + attempt + " 次尝试 ===");
try {
token = attemptLogin();
if (token != null && !token.isEmpty()) {
// 2. 创建爬虫实例传入token
LotteryWebMagicCrawler crawler = new LotteryWebMagicCrawler(token);
// 3. 创建数据处理器
LotteryDataPipeline pipeline = new LotteryDataPipeline();
// 4. 执行爬虫
String url = "https://4701268539-esh.qdk63ayw8g.com/member/dresult?lottery=SGFT&date=2026-01-18";
Spider.create(crawler)
.addUrl(url)
.addPipeline(pipeline)
.thread(1)
.run();
// 5. 返回爬取的数据
List<Map<String, Object>> result = pipeline.getLotteryData();
System.out.println("爬虫完成,获取到 " + result.size() + " 条数据");
System.out.println("===="+result);
return result.toString();
}
if (attempt < MAX_RETRY) {
waitForRetry(attempt);
}
} catch (Exception e) {
System.err.println("" + attempt + " 次尝试失败: " + e.getMessage());
e.printStackTrace();
}
}
return "";
}
@Override
public String getToken(String username, String password, String loginUrl) {
String token = "";
for (int attempt = 1; attempt <= MAX_RETRY; attempt++) {
System.out.println("\n=== 第 " + attempt + " 次尝试 ===");
try {
token = attemptLogin();
if (token != null && !token.isEmpty()) {
return token;
}
if (attempt < MAX_RETRY) {
waitForRetry(attempt);
}
} catch (Exception e) {
System.err.println("" + attempt + " 次尝试失败: " + e.getMessage());
e.printStackTrace();
}
}
return "";
}
/**
* 单次登录尝试
*/
private String attemptLogin() throws IOException, TesseractException, InterruptedException {
CookieStore cookieStore = new BasicCookieStore();
try (CloseableHttpClient httpClient = createHttpClient(cookieStore)) {
// 1. 获取验证码
byte[] imageData = fetchCaptcha(httpClient);
if (imageData == null) {
return null;
}
// 2. OCR识别验证码
String code = processCaptcha(imageData);
if (code == null || code.length() != 4) {
return null;
}
// 3. 执行登录
return performLogin(httpClient, cookieStore, code);
} catch (Exception e) {
throw new IOException("登录尝试失败", e);
}
}
/**
* 创建HttpClient
*/
private CloseableHttpClient createHttpClient(CookieStore cookieStore) {
return HttpClients.custom()
.setDefaultCookieStore(cookieStore)
.build();
}
/**
* 获取验证码图片
*/
private byte[] fetchCaptcha(CloseableHttpClient httpClient)
throws IOException, InterruptedException {
System.out.println("获取验证码...");
// 添加随机延迟
Thread.sleep(1000 + (long) (Math.random() * 1000));
HttpGet getCaptcha = new HttpGet(BASE_URL + "/code");
setCommonHeaders(getCaptcha);
getCaptcha.setHeader("Referer", BASE_URL + "/login");
try (CloseableHttpResponse captchaResponse = httpClient.execute(getCaptcha)) {
int captchaStatus = captchaResponse.getStatusLine().getStatusCode();
System.out.println("验证码响应状态码: " + captchaStatus);
if (captchaStatus == 200) {
return EntityUtils.toByteArray(captchaResponse.getEntity());
} else if (captchaStatus == 429) {
System.out.println("获取验证码被限速,等待后重试...");
Thread.sleep(3000);
} else {
System.out.println("获取验证码失败: " + captchaStatus);
}
}
return null;
}
/**
* 处理验证码识别
*/
private String processCaptcha(byte[] imageData)
throws IOException, TesseractException {
BufferedImage image = ImageIO.read(new ByteArrayInputStream(imageData));
String rawOcr = tesseract.doOCR(image);
// 清理验证码
String code = rawOcr.replaceAll("\\s+", "").trim();
code = code.replaceAll("[^0-9]", ""); // 只保留数字
System.out.println("OCR原始结果: " + rawOcr);
System.out.println("清理后验证码: [" + code + "] 长度: " + code.length());
// 保存图片用于调试
//saveCaptchaImage(image);
return code;
}
/**
* 保存验证码图片
*/
/* private void saveCaptchaImage(BufferedImage image) throws IOException {
String timestamp = new SimpleDateFormat("yyyyMMdd_HHmmss").format(new Date());
File output = new File("captcha_" + timestamp + ".png");
ImageIO.write(image, "png", output);
System.out.println("验证码图片已保存到: " + output.getAbsolutePath());
}*/
/**
* 执行登录请求
*/
private String performLogin(CloseableHttpClient httpClient,
CookieStore cookieStore,
String code) throws IOException, InterruptedException {
System.out.println("执行登录...");
// 等待一下再发送登录请求
Thread.sleep(1500 + (long) (Math.random() * 1000));
HttpPost loginPost = createLoginRequest(code);
try (CloseableHttpResponse loginResponse = httpClient.execute(loginPost)) {
return processLoginResponse(loginResponse, cookieStore);
}
}
/**
* 创建登录请求
*/
private HttpPost createLoginRequest(String code) throws UnsupportedEncodingException {
HttpPost loginPost = new HttpPost(BASE_URL + "/login");
// 设置请求头
setCommonHeaders(loginPost);
loginPost.setHeader("Referer", BASE_URL + "/login");
loginPost.setHeader("Origin", BASE_URL);
loginPost.setHeader("Accept", "application/json, text/plain, */*");
// 构建登录参数
List<NameValuePair> params = new ArrayList<>();
params.add(new BasicNameValuePair("type", "1"));
params.add(new BasicNameValuePair("account", "pmk1"));
params.add(new BasicNameValuePair("password", "Asd123123"));
params.add(new BasicNameValuePair("code", code));
loginPost.setEntity(new UrlEncodedFormEntity(params, "UTF-8"));
// 禁用自动重定向
RequestConfig requestConfig = RequestConfig.custom()
.setRedirectsEnabled(false)
.build();
loginPost.setConfig(requestConfig);
return loginPost;
}
/**
* 处理登录响应
*/
private String processLoginResponse(CloseableHttpResponse loginResponse,
CookieStore cookieStore) throws IOException, InterruptedException {
int statusCode = loginResponse.getStatusLine().getStatusCode();
System.out.println("登录响应状态码: " + statusCode);
// 处理限速
if (statusCode == 429) {
handleRateLimit(loginResponse);
return null;
}
// 打印响应头
printResponseHeaders(loginResponse);
// 检查重定向
if (statusCode == 302) {
if (checkRedirectForError(loginResponse)) {
return null;
}
}
// 读取响应体
String tokenFromBody = extractTokenFromResponseBody(loginResponse);
if (tokenFromBody != null) {
return tokenFromBody;
}
// 从cookies中提取token
return extractTokenFromCookies(cookieStore, statusCode);
}
/**
* 处理速率限制
*/
private void handleRateLimit(CloseableHttpResponse response) throws InterruptedException {
System.out.println("登录请求被限速 (429 Too Many Requests)");
Header retryAfterHeader = response.getFirstHeader("Retry-After");
if (retryAfterHeader != null) {
try {
int retryAfterSeconds = Integer.parseInt(retryAfterHeader.getValue());
System.out.println("服务器要求等待 " + retryAfterSeconds + "");
Thread.sleep(retryAfterSeconds * 1000L);
} catch (NumberFormatException e) {
System.out.println("等待5秒后重试");
Thread.sleep(5000);
}
} else {
System.out.println("等待3秒后重试");
Thread.sleep(3000);
}
}
/**
* 检查重定向是否包含错误
*/
private boolean checkRedirectForError(CloseableHttpResponse response) {
Header locationHeader = response.getFirstHeader("Location");
if (locationHeader != null) {
String location = locationHeader.getValue();
System.out.println("重定向到: " + location);
if (location.contains("e=3")) {
System.out.println("验证码错误 (e=3)");
return true;
}
}
return false;
}
/**
* 打印响应头
*/
private void printResponseHeaders(CloseableHttpResponse response) {
System.out.println("响应头:");
for (Header header : response.getAllHeaders()) {
System.out.println(" " + header.getName() + ": " + header.getValue());
}
}
/**
* 从响应体中提取token
*/
private String extractTokenFromResponseBody(CloseableHttpResponse response) throws IOException {
if (response.getEntity() != null) {
String responseBody = EntityUtils.toString(response.getEntity(), "UTF-8");
if (responseBody != null && !responseBody.isEmpty()) {
System.out.println("响应体: " + responseBody);
// 检查响应体中是否有tokenJSON格式
if (responseBody.contains("\"token\"")) {
// 简单提取token
int start = responseBody.indexOf("\"token\":\"");
if (start != -1) {
start += 9;
int end = responseBody.indexOf("\"", start);
if (end != -1) {
String token = responseBody.substring(start, end);
System.out.println("\n[SUCCESS] 从响应体找到Token!");
System.out.println("Token: " + token);
return token;
}
}
}
}
// 消耗实体
EntityUtils.consume(response.getEntity());
}
return null;
}
/**
* 从cookies中提取token
*/
private String extractTokenFromCookies(CookieStore cookieStore, int statusCode) {
List<Cookie> cookies = cookieStore.getCookies();
System.out.println("所有cookies (" + cookies.size() + "个):");
String token = null;
for (Cookie cookie : cookies) {
System.out.println(" " + cookie.getName() + " = " + cookie.getValue());
if ("token".equals(cookie.getName()) ||
cookie.getName().toLowerCase().contains("token")) {
token = cookie.getValue();
}
}
if (token != null && !token.isEmpty()) {
System.out.println("\n[SUCCESS] Login OK!");
System.out.println("Token: " + token);
return token;
} else if (statusCode == 200) {
System.out.println("登录返回200但没有找到token可能需要检查其他认证方式");
}
return null;
}
/**
* 设置通用请求头
*/
private void setCommonHeaders(HttpRequestBase request) {
request.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36");
request.setHeader("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8");
request.setHeader("Accept-Encoding", "gzip, deflate, br");
request.setHeader("Connection", "keep-alive");
request.setHeader("Upgrade-Insecure-Requests", "1");
}
/**
* 等待重试
*/
private void waitForRetry(int attempt) throws InterruptedException {
System.out.println("\n等待2秒后进行下一次尝试...");
Thread.sleep(2000);
}
}

View File

@@ -3,6 +3,16 @@ package com.tem.bocai.util;
import net.sourceforge.tess4j.Tesseract;
import net.sourceforge.tess4j.TesseractException;
import org.apache.commons.io.IOUtils;
import org.apache.http.Header;
import org.apache.http.NameValuePair;
import org.apache.http.client.CookieStore;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.*;
import org.apache.http.impl.client.BasicCookieStore;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.core.io.Resource;
import org.springframework.core.io.ResourceLoader;
@@ -11,16 +21,25 @@ import org.springframework.stereotype.Service;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.cookie.Cookie;
@Service
public class ImageOcrService {
private final Tesseract tesseract;
private final ResourceLoader resourceLoader;
private static CloseableHttpClient httpClient;
private static final String BASE_URL = "https://4701268539-esh.qdk63ayw8g.com";
@Autowired
public ImageOcrService(Tesseract tesseract, ResourceLoader resourceLoader) {
@@ -44,13 +63,220 @@ public class ImageOcrService {
/**
* 从远程 URL 获取图片并进行 OCR 处理
*
* @param imageUrl 图片 URL
* @return OCR 结果文本
*/
public String ocrRemoteImage(String imageUrl) throws IOException, TesseractException {
public String ocrRemoteImage() throws IOException, TesseractException, InterruptedException {
int maxRetry = 5;
byte[] imageData = IOUtils.toByteArray(new ByteArrayInputStream(IOUtils.toByteArray(imageUrl)));
for (int attempt = 1; attempt <= maxRetry; attempt++) {
System.out.println("\n=== 第 " + attempt + " 次尝试 ===");
// 每次尝试都创建新的HttpClient和CookieStore
CookieStore cookieStore = new BasicCookieStore();
CloseableHttpClient httpClient = HttpClients.custom()
.setDefaultCookieStore(cookieStore)
.build();
try {
// 1. 获取验证码
System.out.println("获取验证码...");
HttpGet getCaptcha = new HttpGet(BASE_URL + "/code");
setCommonHeaders(getCaptcha);
// 添加Referer头
getCaptcha.setHeader("Referer", BASE_URL + "/login");
// 添加随机延迟,避免请求过快
Thread.sleep(1000 + (long)(Math.random() * 1000));
CloseableHttpResponse captchaResponse = httpClient.execute(getCaptcha);
byte[] imageData = null;
try {
int captchaStatus = captchaResponse.getStatusLine().getStatusCode();
System.out.println("验证码响应状态码: " + captchaStatus);
if (captchaStatus == 200) {
imageData = EntityUtils.toByteArray(captchaResponse.getEntity());
} else if (captchaStatus == 429) {
System.out.println("获取验证码被限速,等待后重试...");
Thread.sleep(3000); // 等待3秒
continue; // 继续下一次尝试
} else {
System.out.println("获取验证码失败: " + captchaStatus);
continue;
}
} finally {
captchaResponse.close();
}
// 2. OCR识别验证码
String code = null;
if (imageData != null) {
BufferedImage image = ImageIO.read(new ByteArrayInputStream(imageData));
return tesseract.doOCR(image);
code = tesseract.doOCR(image);
// 清理验证码
code = code.replaceAll("\\s+", "").trim();
code = code.replaceAll("[^0-9]", ""); // 只保留数字
System.out.println("OCR原始结果: " + tesseract.doOCR(image));
System.out.println("清理后验证码: [" + code + "] 长度: " + code.length());
// 保存图片用于调试
File output = new File("captcha_attempt_" + attempt + ".png");
ImageIO.write(image, "png", output);
System.out.println("验证码图片已保存到: " + output.getAbsolutePath());
if (code.length() != 4) {
System.out.println("验证码长度不是4位跳过本次尝试");
continue;
}
} else {
System.out.println("验证码数据为空");
continue;
}
// 等待一下再发送登录请求
Thread.sleep(1500 + (long)(Math.random() * 1000));
// 3. 登录(不自动重定向)
System.out.println("执行登录...");
HttpPost loginPost = new HttpPost(BASE_URL + "/login");
setCommonHeaders(loginPost);
// 重要添加Referer和Origin头
loginPost.setHeader("Referer", BASE_URL + "/login");
loginPost.setHeader("Origin", BASE_URL);
loginPost.setHeader("Accept", "application/json, text/plain, */*");
// 构建登录参数
List<NameValuePair> params = new ArrayList<>();
params.add(new BasicNameValuePair("type", "1"));
params.add(new BasicNameValuePair("account", "pmk1"));
params.add(new BasicNameValuePair("password", "Asd123123"));
params.add(new BasicNameValuePair("code", code));
loginPost.setEntity(new UrlEncodedFormEntity(params, "UTF-8"));
// 禁用自动重定向
RequestConfig requestConfig = RequestConfig.custom()
.setRedirectsEnabled(false)
.build();
loginPost.setConfig(requestConfig);
CloseableHttpResponse loginResponse = httpClient.execute(loginPost);
try {
int statusCode = loginResponse.getStatusLine().getStatusCode();
System.out.println("登录响应状态码: " + statusCode);
// 处理429错误
if (statusCode == 429) {
System.out.println("登录请求被限速 (429 Too Many Requests)");
// 检查Retry-After头
Header retryAfterHeader = loginResponse.getFirstHeader("Retry-After");
if (retryAfterHeader != null) {
try {
int retryAfterSeconds = Integer.parseInt(retryAfterHeader.getValue());
System.out.println("服务器要求等待 " + retryAfterSeconds + "");
Thread.sleep(retryAfterSeconds * 1000L);
} catch (NumberFormatException e) {
System.out.println("等待5秒后重试");
Thread.sleep(5000);
}
} else {
System.out.println("等待3秒后重试");
Thread.sleep(3000);
}
continue; // 继续下一次尝试
}
// 打印响应头
System.out.println("响应头:");
for (Header header : loginResponse.getAllHeaders()) {
System.out.println(" " + header.getName() + ": " + header.getValue());
}
// 检查是否是重定向
if (statusCode == 302) {
Header locationHeader = loginResponse.getFirstHeader("Location");
if (locationHeader != null) {
String location = locationHeader.getValue();
System.out.println("重定向到: " + location);
if (location.contains("e=3")) {
System.out.println("验证码错误 (e=3)");
continue; // 继续下一次尝试
}
}
}
// 读取响应体(如果有)
if (loginResponse.getEntity() != null) {
String responseBody = EntityUtils.toString(loginResponse.getEntity(), "UTF-8");
if (responseBody != null && !responseBody.isEmpty()) {
System.out.println("响应体: " + responseBody);
// 检查响应体中是否有tokenJSON格式
if (responseBody.contains("\"token\"")) {
// 简单提取token
int start = responseBody.indexOf("\"token\":\"");
if (start != -1) {
start += 9;
int end = responseBody.indexOf("\"", start);
if (end != -1) {
String token = responseBody.substring(start, end);
System.out.println("\n[SUCCESS] 从响应体找到Token!");
System.out.println("Token: " + token);
return token;
}
}
}
}
// 消耗实体
EntityUtils.consume(loginResponse.getEntity());
}
// 4. 检查cookies中是否有token
String token = null;
List<Cookie> cookies = cookieStore.getCookies();
System.out.println("所有cookies (" + cookies.size() + "个):");
for (Cookie cookie : cookies) {
System.out.println(" " + cookie.getName() + " = " + cookie.getValue());
// 查找token
if ("token".equals(cookie.getName()) ||
cookie.getName().toLowerCase().contains("token")) {
token = cookie.getValue();
}
}
if (token != null && !token.isEmpty()) {
System.out.println("\n[SUCCESS] Login OK!");
System.out.println("Token: " + token);
return token;
} else if (statusCode == 200) {
// 如果是200状态码但没有token可能是登录成功但token在其他地方
System.out.println("登录返回200但没有找到token可能需要检查其他认证方式");
}
} finally {
loginResponse.close();
}
} finally {
httpClient.close();
}
// 如果不是最后一次尝试,等待一段时间
if (attempt < maxRetry) {
System.out.println("\n等待2秒后进行下一次尝试...");
Thread.sleep(2000);
}
}
System.out.println("\n[FAILED] " + maxRetry + " 次尝试都失败了");
return "";
}
/**
 * Applies the browser-like headers shared by the captcha and login requests.
 *
 * @param request request to decorate
 */
private void setCommonHeaders(HttpRequestBase request) {
    request.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36");
    request.setHeader("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8");
    // "br" is deliberately not advertised: HttpClient 4.x only decodes gzip/deflate, so a
    // Brotli-compressed body would reach the OCR and token parsing still compressed.
    request.setHeader("Accept-Encoding", "gzip, deflate");
    request.setHeader("Connection", "keep-alive");
    request.setHeader("Upgrade-Insecure-Requests", "1");
    request.setHeader("Sec-Fetch-Dest", "document");
    request.setHeader("Sec-Fetch-Mode", "navigate");
    request.setHeader("Sec-Fetch-Site", "same-origin");
    request.setHeader("Sec-Fetch-User", "?1");
}
}

View File

@@ -9,7 +9,7 @@ import java.util.Map;
public class ImageTest {
/**
 * Manual entry point: calls {@code getImageStreamAndCookie} once with the site's
 * {@code /code} URL.
 *
 * @param args ignored
 * @throws Exception whatever {@code getImageStreamAndCookie} throws
 */
public static void main(String[] args) throws Exception {
    getImageStreamAndCookie("https://4701268539-esh.qdk63ayw8g.com/code");
}
/**

View File

@@ -0,0 +1,29 @@
package com.tem.bocai.util;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
/**
 * Pipeline that accumulates every non-empty {@code "lotteryData"} list found in the
 * result items it is given.
 */
public class LotteryDataPipeline implements Pipeline {

    private List<Map<String, Object>> lotteryData = new ArrayList<>();

    /**
     * Appends the {@code "lotteryData"} entries of {@code resultItems} to the collected
     * rows; result items without that field, or with an empty list, are ignored.
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        List<Map<String, Object>> rows = resultItems.get("lotteryData");
        if (rows == null || rows.isEmpty()) {
            return;
        }
        lotteryData.addAll(rows);
        System.out.println("Pipeline处理数据: " + rows.size() + "");
    }

    /**
     * @return the list holding every row collected so far (the internal list, not a copy)
     */
    public List<Map<String, Object>> getLotteryData() {
        return lotteryData;
    }
}

View File

@@ -0,0 +1,277 @@
package com.tem.bocai.util;
import com.fasterxml.jackson.databind.SerializationFeature;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Selectable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import com.fasterxml.jackson.databind.ObjectMapper;
import java.io.File;
import java.io.IOException;
import java.util.*;
/**
 * WebMagic page processor that downloads a draw-result page using a token cookie,
 * parses the rows of the {@code #drawTable} table and publishes them under the
 * {@code "lotteryData"} result field.
 */
public class LotteryWebMagicCrawler implements PageProcessor {

    /** Result field the parsed rows are published under; pipelines read the same key. */
    private static final String RESULT_FIELD = "lotteryData";
    private static final int MAX_BALLS = 10;
    private static final int MAX_GLH_RESULTS = 5;
    private static final java.util.regex.Pattern DIGITS = java.util.regex.Pattern.compile("\\d+");
    private static final ObjectMapper JSON_WRITER =
            new ObjectMapper().enable(SerializationFeature.INDENT_OUTPUT);

    private final String token;
    private final Site site;

    /**
     * @param token value sent as the {@code token} cookie; no cookie header is added when
     *              it is null or empty
     */
    public LotteryWebMagicCrawler(String token) {
        this.token = token;
        this.site = buildSite(token);
    }

    /** Builds the site configuration: retries, sleep, timeout, user agent and cookie. */
    private static Site buildSite(String token) {
        Site configured = Site.me()
                .setRetryTimes(3)
                .setSleepTime(1000)
                .setTimeOut(10000)
                .setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36");
        if (token != null && !token.isEmpty()) {
            configured.addHeader("cookie", "token=" + token);
        }
        return configured;
    }

    /**
     * Logs basic facts about the page, parses the draw table and stores the rows in the
     * page's result items so that pipelines receive them.
     */
    @Override
    public void process(Page page) {
        Html html = page.getHtml();

        System.out.println("页面URL: " + page.getUrl());
        System.out.println("页面标题: " + html.xpath("//title/text()").get());

        Selectable tables = html.xpath("//table");
        System.out.println("找到 " + tables.nodes().size() + " 个表格");

        extractTableData(html);

        Selectable links = html.links();
        System.out.println("页面包含 " + links.all().size() + " 个链接");

        // Without this the pipeline never sees any data: it reads the rows from this field.
        page.putField(RESULT_FIELD, parseLotteryHtml(html.toString()));
    }

    /** Prints the cell texts of every table row that has at least one. */
    private void extractTableData(Html html) {
        for (Selectable row : html.xpath("//tr").nodes()) {
            List<String> cells = row.xpath("//td/text()").all();
            // Test the list itself: its toString() is never empty ("[]" at the least).
            if (!cells.isEmpty()) {
                System.out.println("行数据: " + cells);
            }
        }
    }

    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Parses the rows of {@code #drawTable} into one map per row with the keys
     * {@code id}, {@code time}, {@code result}, {@code winner}, {@code GD1}, {@code GD2},
     * {@code sum1}, {@code sum2} and {@code GLH_result}. Rows without an id are dropped.
     *
     * <p>When the table is present the parsed list is also written to a JSON file via
     * {@link #writeToJsonFile(List)} and printed.
     *
     * @param htmlContent HTML text of the crawled page
     * @return the parsed rows; empty when {@code htmlContent} is null or has no {@code #drawTable}
     * @throws RuntimeException when the JSON file cannot be written
     */
    public static List<Map<String, Object>> parseLotteryHtml(String htmlContent) {
        List<Map<String, Object>> resultList = new ArrayList<>();
        if (htmlContent == null) {
            return resultList;
        }
        Document doc = Jsoup.parse(htmlContent);
        Element targetTable = doc.selectFirst("#drawTable");
        if (targetTable == null) {
            return resultList;
        }

        for (Element tr : targetTable.select("table > tbody > tr")) {
            Map<String, Object> rowData = parseRow(tr);
            // Only rows that carry a period id are valid data rows
            if (!rowData.get("id").toString().isEmpty()) {
                resultList.add(rowData);
            }
        }

        writeToJsonFile(resultList);
        System.out.println("打印结果===" + resultList);
        return resultList;
    }

    /** Converts one table row into its map representation. */
    private static Map<String, Object> parseRow(Element tr) {
        Map<String, Object> rowData = new HashMap<>();

        // 1. Period id and 2. draw time
        rowData.put("id", textOf(tr.selectFirst("td.period")));
        rowData.put("time", textOf(tr.selectFirst("td.drawTime")));

        // 3. Drawn numbers: the first ten numeric td.ballname cells
        List<Integer> resultNumbers = new ArrayList<>();
        for (Element td : tr.select("td.ballname")) {
            if (resultNumbers.size() >= MAX_BALLS) {
                break;
            }
            Integer number = parseDigits(td.text().trim());
            if (number != null) {
                resultNumbers.add(number);
            }
        }
        rowData.put("result", resultNumbers);

        // 4. Winner (td.other1)
        rowData.put("winner", intOrBlank(tr.selectFirst("td.other1")));

        // 5. GD1 / GD2: td.other cells whose class contains GDX / GDS; the last match wins
        Elements otherTds = tr.select("td.other");
        String gd1 = "";
        String gd2 = "";
        for (Element td : otherTds) {
            String className = td.className();
            if (className.contains("GDX")) {
                gd1 = td.text().trim();
            } else if (className.contains("GDS")) {
                gd2 = td.text().trim();
            }
        }
        rowData.put("GD1", gd1);
        rowData.put("GD2", gd2);

        // 6. Sums
        rowData.put("sum1", intOrBlank(tr.selectFirst("td.dldhl_sum")));
        rowData.put("sum2", intOrBlank(tr.selectFirst("td.dldhh_sum")));

        // 7. GLH results: the first five td.other cells whose class contains "GLH_"
        List<String> glhResults = new ArrayList<>();
        for (Element td : otherTds) {
            if (glhResults.size() >= MAX_GLH_RESULTS) {
                break;
            }
            if (td.className().contains("GLH_")) {
                glhResults.add(td.text().trim());
            }
        }
        rowData.put("GLH_result", glhResults);

        return rowData;
    }

    /** Trimmed text of the element, or "" when the element is absent. */
    private static String textOf(Element element) {
        return element != null ? element.text().trim() : "";
    }

    /** The cell's integer value, or "" when the cell is absent or not a plain integer. */
    private static Object intOrBlank(Element cell) {
        Integer value = parseDigits(textOf(cell));
        if (value != null) {
            return value;
        }
        return "";
    }

    /** Parses an all-digit string; null when it has other characters or does not fit an int. */
    private static Integer parseDigits(String text) {
        if (!DIGITS.matcher(text).matches()) {
            return null;
        }
        try {
            return Integer.valueOf(text);
        } catch (NumberFormatException overflow) {
            // All digits but too large for an int: treat like any other non-numeric cell.
            return null;
        }
    }

    /**
     * Writes the rows as indented JSON to {@code output/json/result_<millis>.json},
     * creating the directory when needed.
     *
     * @param resultList rows to serialise
     * @throws RuntimeException when the directory or file cannot be written
     */
    public static void writeToJsonFile(List<Map<String, Object>> resultList) {
        try {
            String directoryPath = "output/json";
            String fileName = "result_" + System.currentTimeMillis() + ".json";

            File directory = new File(directoryPath);
            if (!directory.isDirectory() && !directory.mkdirs()) {
                throw new IOException("cannot create directory " + directory.getAbsolutePath());
            }

            File outputFile = new File(directoryPath + "/" + fileName);
            JSON_WRITER.writeValue(outputFile, resultList);
            System.out.println("数据已成功写入文件: " + outputFile.getAbsolutePath());
        } catch (IOException e) {
            throw new RuntimeException("写入 JSON 文件失败: " + e.getMessage(), e);
        }
    }

    /** Manual entry point: crawls a fixed result URL without a token, on one thread. */
    public static void main(String[] args) {
        String url = "https://4701268539-esh.qdk63ayw8g.com/member/dresult?lottery=SGFT&date=2026-01-18";
        Spider.create(new LotteryWebMagicCrawler(""))
                .addUrl(url)
                .thread(1)
                .run();
    }
}

View File

@@ -0,0 +1,217 @@
package com.tem.bocai.util;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.springframework.stereotype.Component;
import org.springframework.util.StringUtils;
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;
/**
 * Stores a token together with its save time in a small JSON file
 * ({@code token_cache.json}, resolved against the working directory) and treats it as
 * expired once {@code TOKEN_EXPIRE_TIME} milliseconds have passed.
 */
@Component
public class TokenCacheManager {

    private static final String CACHE_FILE_PATH = "token_cache.json";
    private static final long TOKEN_EXPIRE_TIME = 1140000; // 19 minutes in milliseconds (19 * 60 * 1000)
    private static final long MILLIS_PER_MINUTE = 60_000L;

    private final ObjectMapper objectMapper = new ObjectMapper();

    /**
     * Writes the token, the current time and the expiry settings to the cache file.
     * An I/O failure is reported on stderr and otherwise ignored.
     *
     * @param token token to cache
     */
    public void saveToken(String token) {
        try {
            Map<String, Object> cacheData = new HashMap<>();
            cacheData.put("token", token);
            cacheData.put("timestamp", System.currentTimeMillis());
            cacheData.put("expireTime", TOKEN_EXPIRE_TIME);
            cacheData.put("expireMinutes", getTokenExpireMinutes());

            // writeValueAsBytes always emits UTF-8, independent of the platform charset.
            Files.write(Paths.get(CACHE_FILE_PATH), objectMapper.writeValueAsBytes(cacheData));

            System.out.println("Token已保存到缓存文件: " + CACHE_FILE_PATH);
            System.out.println("Token将在19分钟后过期");
        } catch (IOException e) {
            System.err.println("保存token缓存失败: " + e.getMessage());
        }
    }

    /**
     * Reads the cached token.
     *
     * @return the token, or {@code null} when the cache file is missing, unreadable,
     *         incomplete or older than the expiry time
     */
    public String readToken() {
        try {
            Map<String, Object> cacheData = loadCache();
            if (cacheData == null) {
                System.out.println("缓存文件不存在");
                return null;
            }

            Object token = cacheData.get("token");
            Long timestamp = timestampOf(cacheData);
            if (!(token instanceof String) || !StringUtils.hasText((String) token) || timestamp == null) {
                System.out.println("缓存数据不完整");
                return null;
            }

            long elapsedTime = System.currentTimeMillis() - timestamp;
            if (elapsedTime > TOKEN_EXPIRE_TIME) {
                long expiredSeconds = (elapsedTime - TOKEN_EXPIRE_TIME) / 1000;
                System.out.println("Token已过期 " + expiredSeconds + "秒");
                return null;
            }

            long remainingTime = TOKEN_EXPIRE_TIME - elapsedTime;
            long remainingMinutes = remainingTime / MILLIS_PER_MINUTE;
            long remainingSeconds = (remainingTime % MILLIS_PER_MINUTE) / 1000;
            System.out.println("从缓存读取token剩余有效时间: "
                    + remainingMinutes + "分" + remainingSeconds + "秒");
            return (String) token;
        } catch (IOException e) {
            System.err.println("读取token缓存失败: " + e.getMessage());
            return null;
        }
    }

    /** Deletes the cache file if it exists; an I/O failure is reported on stderr. */
    public void clearToken() {
        try {
            if (Files.deleteIfExists(Paths.get(CACHE_FILE_PATH))) {
                System.out.println("Token缓存已清除");
            }
        } catch (IOException e) {
            System.err.println("清除token缓存失败: " + e.getMessage());
        }
    }

    /**
     * @return true when {@link #readToken()} yields a token
     */
    public boolean hasValidToken() {
        return readToken() != null;
    }

    /**
     * Tells whether the cached token is still valid but will expire within the given
     * number of minutes.
     *
     * @param warningMinutes size of the warning window in minutes
     * @return true when the remaining lifetime is positive and within the window; false
     *         otherwise, including when the cache cannot be read
     */
    public boolean isTokenExpiringSoon(int warningMinutes) {
        try {
            Map<String, Object> cacheData = loadCache();
            Long timestamp = cacheData == null ? null : timestampOf(cacheData);
            if (timestamp == null) {
                return false;
            }
            long remainingTime = TOKEN_EXPIRE_TIME - (System.currentTimeMillis() - timestamp);
            // long arithmetic: warningMinutes * 60000 would overflow int for large windows
            return remainingTime > 0 && remainingTime <= warningMinutes * MILLIS_PER_MINUTE;
        } catch (IOException | RuntimeException e) {
            return false;
        }
    }

    /**
     * Returns the cached entries enriched with derived fields for debugging:
     * {@code savedTime}, {@code elapsedTime}, {@code remainingTime}, {@code isValid},
     * {@code expireMinutes} and, when expiry is within ten minutes, {@code warning}.
     *
     * @return the enriched cache content, or {@code null} when the cache file is missing
     *         or cannot be read
     */
    public Map<String, Object> getTokenInfo() {
        try {
            Map<String, Object> cacheData = loadCache();
            if (cacheData == null) {
                return null;
            }
            Long timestamp = timestampOf(cacheData);
            if (timestamp != null) {
                long elapsedTime = System.currentTimeMillis() - timestamp;
                long remainingTime = TOKEN_EXPIRE_TIME - elapsedTime;

                cacheData.put("savedTime", new Date(timestamp).toString());
                cacheData.put("elapsedTime", formatTime(elapsedTime));
                cacheData.put("remainingTime", formatTime(remainingTime));
                cacheData.put("isValid", remainingTime > 0);
                cacheData.put("expireMinutes", getTokenExpireMinutes());

                if (remainingTime > 0) {
                    if (remainingTime <= 5 * MILLIS_PER_MINUTE) {
                        cacheData.put("warning", "Token将在5分钟内过期建议刷新");
                    } else if (remainingTime <= 10 * MILLIS_PER_MINUTE) {
                        cacheData.put("warning", "Token将在10分钟内过期");
                    }
                }
            }
            return cacheData;
        } catch (IOException | RuntimeException e) {
            return null;
        }
    }

    /**
     * Loads the cache file as an untyped map.
     *
     * @return the parsed content, or {@code null} when the file does not exist
     * @throws IOException when the file cannot be read or is not a JSON object
     */
    @SuppressWarnings("unchecked") // a JSON object always binds to Map<String, Object>
    private Map<String, Object> loadCache() throws IOException {
        File cacheFile = new File(CACHE_FILE_PATH);
        if (!cacheFile.exists()) {
            return null;
        }
        return objectMapper.readValue(Files.readAllBytes(cacheFile.toPath()), Map.class);
    }

    /**
     * Extracts the save time. Untyped binding yields Integer or Long depending on the
     * magnitude, so any Number is accepted instead of casting to Long.
     */
    private static Long timestampOf(Map<String, Object> cacheData) {
        Object raw = cacheData.get("timestamp");
        return raw instanceof Number ? Long.valueOf(((Number) raw).longValue()) : null;
    }

    /** Formats a duration as minutes and seconds, or as expired when it is not positive. */
    private String formatTime(long millis) {
        if (millis <= 0) {
            return "已过期";
        }
        long minutes = millis / MILLIS_PER_MINUTE;
        long seconds = (millis % MILLIS_PER_MINUTE) / 1000;
        if (minutes > 0) {
            return minutes + "分" + seconds + "秒";
        }
        return seconds + "秒";
    }

    /**
     * @return the configured token lifetime in milliseconds
     */
    public static long getTokenExpireTime() {
        return TOKEN_EXPIRE_TIME;
    }

    /**
     * @return the configured token lifetime in whole minutes
     */
    public static int getTokenExpireMinutes() {
        return (int) (TOKEN_EXPIRE_TIME / MILLIS_PER_MINUTE);
    }
}