diff --git a/pom.xml b/pom.xml index 0710c2a..44d7c7c 100644 --- a/pom.xml +++ b/pom.xml @@ -55,6 +55,29 @@ tess4j 5.18.0 + + + us.codecraft + webmagic-core + 1.0.3 + + + + us.codecraft + webmagic-extension + 1.0.3 + + + + com.alibaba + fastjson + 2.0.46 + + diff --git a/src/main/java/com/tem/bocai/controller/LoginCrawler.java b/src/main/java/com/tem/bocai/controller/LoginCrawler.java new file mode 100644 index 0000000..3e19e89 --- /dev/null +++ b/src/main/java/com/tem/bocai/controller/LoginCrawler.java @@ -0,0 +1,29 @@ +package com.tem.bocai.controller; + +import com.tem.bocai.service.LoginService; +import com.tem.bocai.util.ImageOcrService; +import net.sourceforge.tess4j.TesseractException; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.http.ResponseEntity; +import org.springframework.web.bind.annotation.GetMapping; +import org.springframework.web.bind.annotation.RestController; + +import java.io.IOException; +@RestController +public class LoginCrawler { + + private final LoginService loginService; + + // 构造函数注入 + public LoginCrawler(LoginService loginService) { + this.loginService = loginService; + } + @GetMapping("/ocr/login") + public ResponseEntity ocrLocalImage(String username, String password,String loginUrl,Integer winNum,Integer loseNum) throws IOException, TesseractException { + String result = loginService.loginAutomatic(username,password,loginUrl,winNum,loseNum); + return ResponseEntity.ok(result); + } + +} + + diff --git a/src/main/java/com/tem/bocai/controller/TestController.java b/src/main/java/com/tem/bocai/controller/TestController.java index 1b1a7d9..345a55a 100644 --- a/src/main/java/com/tem/bocai/controller/TestController.java +++ b/src/main/java/com/tem/bocai/controller/TestController.java @@ -29,9 +29,10 @@ public class TestController { } @GetMapping("/ocr/remote") - public ResponseEntity ocrRemoteImage(String imageUrl) throws IOException, TesseractException { - - String result = imageOcrService.ocrRemoteImage(imageUrl); + public ResponseEntity ocrRemoteImage(String imageUrl) throws IOException, TesseractException, InterruptedException { + imageUrl = "https://4701268539-esh.qdk63ayw8g.com/code"; + String result = imageOcrService.ocrRemoteImage(); + System.out.println("++++"+result); return ResponseEntity.ok(result); } } diff --git a/src/main/java/com/tem/bocai/service/LoginService.java b/src/main/java/com/tem/bocai/service/LoginService.java new file mode 100644 index 0000000..496828d --- /dev/null +++ b/src/main/java/com/tem/bocai/service/LoginService.java @@ -0,0 +1,10 @@ +package com.tem.bocai.service; + +public interface LoginService { + + + String loginAutomatic(String username, String password,String loginUrl,Integer winNum,Integer loseNum); + + //获取token + String getToken(String username, String password, String loginUrl); +} diff --git a/src/main/java/com/tem/bocai/service/impl/LoginServiceImpl.java b/src/main/java/com/tem/bocai/service/impl/LoginServiceImpl.java new file mode 100644 index 0000000..5dc4fef --- /dev/null +++ b/src/main/java/com/tem/bocai/service/impl/LoginServiceImpl.java @@ -0,0 +1,410 @@ +package com.tem.bocai.service.impl; + +import com.tem.bocai.service.LoginService; +import com.tem.bocai.util.LotteryDataPipeline; +import com.tem.bocai.util.LotteryWebMagicCrawler; +import org.springframework.stereotype.Service; +import net.sourceforge.tess4j.Tesseract; +import net.sourceforge.tess4j.TesseractException; +import org.apache.http.Header; +import org.apache.http.NameValuePair; +import org.apache.http.client.CookieStore; +import org.apache.http.client.entity.UrlEncodedFormEntity; +import org.apache.http.client.methods.*; +import org.apache.http.impl.client.BasicCookieStore; +import org.apache.http.impl.client.CloseableHttpClient; +import org.apache.http.impl.client.HttpClients; +import org.apache.http.message.BasicNameValuePair; +import org.apache.http.util.EntityUtils; +import org.springframework.beans.factory.annotation.Autowired; + +import javax.imageio.ImageIO; +import java.awt.image.BufferedImage; +import java.io.*; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; + +import org.apache.http.client.methods.HttpGet; +import org.apache.http.client.methods.HttpPost; + +import org.apache.http.client.config.RequestConfig; +import org.apache.http.cookie.Cookie; +import us.codecraft.webmagic.Spider; + +@Service +public class LoginServiceImpl implements LoginService { + private static final String BASE_URL = "https://4701268539-esh.qdk63ayw8g.com"; + private static final int MAX_RETRY = 5; + @Autowired + private Tesseract tesseract; + + @Override + public String loginAutomatic(String username, String password, String loginUrl, Integer winNum, Integer loseNum) { + String token = ""; + for (int attempt = 1; attempt <= MAX_RETRY; attempt++) { + System.out.println("\n=== 第 " + attempt + " 次尝试 ==="); + try { + token = attemptLogin(); + if (token != null && !token.isEmpty()) { + // 2. 创建爬虫实例,传入token + LotteryWebMagicCrawler crawler = new LotteryWebMagicCrawler(token); + + // 3. 创建数据处理器 + LotteryDataPipeline pipeline = new LotteryDataPipeline(); + // 4. 执行爬虫 + String url = "https://4701268539-esh.qdk63ayw8g.com/member/dresult?lottery=SGFT&date=2026-01-18"; + + Spider.create(crawler) + .addUrl(url) + .addPipeline(pipeline) + .thread(1) + .run(); + + // 5. 返回爬取的数据 + List> result = pipeline.getLotteryData(); + System.out.println("爬虫完成,获取到 " + result.size() + " 条数据"); + System.out.println("===="+result); + + return result.toString(); + } + + if (attempt < MAX_RETRY) { + waitForRetry(attempt); + } + } catch (Exception e) { + System.err.println("第 " + attempt + " 次尝试失败: " + e.getMessage()); + e.printStackTrace(); + } + } + return ""; + } + + + @Override + public String getToken(String username, String password, String loginUrl) { + String token = ""; + for (int attempt = 1; attempt <= MAX_RETRY; attempt++) { + System.out.println("\n=== 第 " + attempt + " 次尝试 ==="); + try { + token = attemptLogin(); + if (token != null && !token.isEmpty()) { + return token; + } + if (attempt < MAX_RETRY) { + waitForRetry(attempt); + } + } catch (Exception e) { + System.err.println("第 " + attempt + " 次尝试失败: " + e.getMessage()); + e.printStackTrace(); + } + } + return ""; + } + + /** + * 单次登录尝试 + */ + private String attemptLogin() throws IOException, TesseractException, InterruptedException { + CookieStore cookieStore = new BasicCookieStore(); + try (CloseableHttpClient httpClient = createHttpClient(cookieStore)) { + // 1. 获取验证码 + byte[] imageData = fetchCaptcha(httpClient); + if (imageData == null) { + return null; + } + // 2. OCR识别验证码 + String code = processCaptcha(imageData); + if (code == null || code.length() != 4) { + return null; + } + // 3. 执行登录 + return performLogin(httpClient, cookieStore, code); + + } catch (Exception e) { + throw new IOException("登录尝试失败", e); + } + } + + /** + * 创建HttpClient + */ + private CloseableHttpClient createHttpClient(CookieStore cookieStore) { + return HttpClients.custom() + .setDefaultCookieStore(cookieStore) + .build(); + } + + /** + * 获取验证码图片 + */ + private byte[] fetchCaptcha(CloseableHttpClient httpClient) + throws IOException, InterruptedException { + + System.out.println("获取验证码..."); + + // 添加随机延迟 + Thread.sleep(1000 + (long) (Math.random() * 1000)); + + HttpGet getCaptcha = new HttpGet(BASE_URL + "/code"); + setCommonHeaders(getCaptcha); + getCaptcha.setHeader("Referer", BASE_URL + "/login"); + + try (CloseableHttpResponse captchaResponse = httpClient.execute(getCaptcha)) { + int captchaStatus = captchaResponse.getStatusLine().getStatusCode(); + System.out.println("验证码响应状态码: " + captchaStatus); + + if (captchaStatus == 200) { + return EntityUtils.toByteArray(captchaResponse.getEntity()); + } else if (captchaStatus == 429) { + System.out.println("获取验证码被限速,等待后重试..."); + Thread.sleep(3000); + } else { + System.out.println("获取验证码失败: " + captchaStatus); + } + } + + return null; + } + + /** + * 处理验证码识别 + */ + private String processCaptcha(byte[] imageData) + throws IOException, TesseractException { + + BufferedImage image = ImageIO.read(new ByteArrayInputStream(imageData)); + String rawOcr = tesseract.doOCR(image); + + // 清理验证码 + String code = rawOcr.replaceAll("\\s+", "").trim(); + code = code.replaceAll("[^0-9]", ""); // 只保留数字 + System.out.println("OCR原始结果: " + rawOcr); + System.out.println("清理后验证码: [" + code + "] 长度: " + code.length()); + // 保存图片用于调试 + //saveCaptchaImage(image); + + return code; + } + + /** + * 保存验证码图片 + */ +/* private void saveCaptchaImage(BufferedImage image) throws IOException { + String timestamp = new SimpleDateFormat("yyyyMMdd_HHmmss").format(new Date()); + File output = new File("captcha_" + timestamp + ".png"); + ImageIO.write(image, "png", output); + System.out.println("验证码图片已保存到: " + output.getAbsolutePath()); + }*/ + + /** + * 执行登录请求 + */ + private String performLogin(CloseableHttpClient httpClient, + CookieStore cookieStore, + String code) throws IOException, InterruptedException { + + System.out.println("执行登录..."); + // 等待一下再发送登录请求 + Thread.sleep(1500 + (long) (Math.random() * 1000)); + + HttpPost loginPost = createLoginRequest(code); + + try (CloseableHttpResponse loginResponse = httpClient.execute(loginPost)) { + return processLoginResponse(loginResponse, cookieStore); + } + } + + /** + * 创建登录请求 + */ + private HttpPost createLoginRequest(String code) throws UnsupportedEncodingException { + HttpPost loginPost = new HttpPost(BASE_URL + "/login"); + + // 设置请求头 + setCommonHeaders(loginPost); + loginPost.setHeader("Referer", BASE_URL + "/login"); + loginPost.setHeader("Origin", BASE_URL); + loginPost.setHeader("Accept", "application/json, text/plain, */*"); + + // 构建登录参数 + List params = new ArrayList<>(); + params.add(new BasicNameValuePair("type", "1")); + params.add(new BasicNameValuePair("account", "pmk1")); + params.add(new BasicNameValuePair("password", "Asd123123")); + params.add(new BasicNameValuePair("code", code)); + + loginPost.setEntity(new UrlEncodedFormEntity(params, "UTF-8")); + + // 禁用自动重定向 + RequestConfig requestConfig = RequestConfig.custom() + .setRedirectsEnabled(false) + .build(); + loginPost.setConfig(requestConfig); + + return loginPost; + } + + /** + * 处理登录响应 + */ + private String processLoginResponse(CloseableHttpResponse loginResponse, + CookieStore cookieStore) throws IOException, InterruptedException { + + int statusCode = loginResponse.getStatusLine().getStatusCode(); + System.out.println("登录响应状态码: " + statusCode); + + // 处理限速 + if (statusCode == 429) { + handleRateLimit(loginResponse); + return null; + } + + // 打印响应头 + printResponseHeaders(loginResponse); + + // 检查重定向 + if (statusCode == 302) { + if (checkRedirectForError(loginResponse)) { + return null; + } + } + + // 读取响应体 + String tokenFromBody = extractTokenFromResponseBody(loginResponse); + if (tokenFromBody != null) { + return tokenFromBody; + } + + // 从cookies中提取token + return extractTokenFromCookies(cookieStore, statusCode); + } + + /** + * 处理速率限制 + */ + private void handleRateLimit(CloseableHttpResponse response) throws InterruptedException { + System.out.println("登录请求被限速 (429 Too Many Requests)"); + + Header retryAfterHeader = response.getFirstHeader("Retry-After"); + if (retryAfterHeader != null) { + try { + int retryAfterSeconds = Integer.parseInt(retryAfterHeader.getValue()); + System.out.println("服务器要求等待 " + retryAfterSeconds + " 秒"); + Thread.sleep(retryAfterSeconds * 1000L); + } catch (NumberFormatException e) { + System.out.println("等待5秒后重试"); + Thread.sleep(5000); + } + } else { + System.out.println("等待3秒后重试"); + Thread.sleep(3000); + } + } + + /** + * 检查重定向是否包含错误 + */ + private boolean checkRedirectForError(CloseableHttpResponse response) { + Header locationHeader = response.getFirstHeader("Location"); + if (locationHeader != null) { + String location = locationHeader.getValue(); + System.out.println("重定向到: " + location); + + if (location.contains("e=3")) { + System.out.println("验证码错误 (e=3)"); + return true; + } + } + return false; + } + + /** + * 打印响应头 + */ + private void printResponseHeaders(CloseableHttpResponse response) { + System.out.println("响应头:"); + for (Header header : response.getAllHeaders()) { + System.out.println(" " + header.getName() + ": " + header.getValue()); + } + } + + /** + * 从响应体中提取token + */ + private String extractTokenFromResponseBody(CloseableHttpResponse response) throws IOException { + if (response.getEntity() != null) { + String responseBody = EntityUtils.toString(response.getEntity(), "UTF-8"); + if (responseBody != null && !responseBody.isEmpty()) { + System.out.println("响应体: " + responseBody); + + // 检查响应体中是否有token(JSON格式) + if (responseBody.contains("\"token\"")) { + // 简单提取token + int start = responseBody.indexOf("\"token\":\""); + if (start != -1) { + start += 9; + int end = responseBody.indexOf("\"", start); + if (end != -1) { + String token = responseBody.substring(start, end); + System.out.println("\n[SUCCESS] 从响应体找到Token!"); + System.out.println("Token: " + token); + return token; + } + } + } + } + // 消耗实体 + EntityUtils.consume(response.getEntity()); + } + return null; + } + + /** + * 从cookies中提取token + */ + private String extractTokenFromCookies(CookieStore cookieStore, int statusCode) { + List cookies = cookieStore.getCookies(); + System.out.println("所有cookies (" + cookies.size() + "个):"); + + String token = null; + for (Cookie cookie : cookies) { + System.out.println(" " + cookie.getName() + " = " + cookie.getValue()); + + if ("token".equals(cookie.getName()) || + cookie.getName().toLowerCase().contains("token")) { + token = cookie.getValue(); + } + } + + if (token != null && !token.isEmpty()) { + System.out.println("\n[SUCCESS] Login OK!"); + System.out.println("Token: " + token); + return token; + } else if (statusCode == 200) { + System.out.println("登录返回200但没有找到token,可能需要检查其他认证方式"); + } + + return null; + } + + /** + * 设置通用请求头 + */ + private void setCommonHeaders(HttpRequestBase request) { + request.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"); + request.setHeader("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8"); + request.setHeader("Accept-Encoding", "gzip, deflate, br"); + request.setHeader("Connection", "keep-alive"); + request.setHeader("Upgrade-Insecure-Requests", "1"); + } + + /** + * 等待重试 + */ + private void waitForRetry(int attempt) throws InterruptedException { + System.out.println("\n等待2秒后进行下一次尝试..."); + Thread.sleep(2000); + } +} + diff --git a/src/main/java/com/tem/bocai/util/ImageOcrService.java b/src/main/java/com/tem/bocai/util/ImageOcrService.java index 909fced..49c8557 100644 --- a/src/main/java/com/tem/bocai/util/ImageOcrService.java +++ b/src/main/java/com/tem/bocai/util/ImageOcrService.java @@ -3,6 +3,16 @@ package com.tem.bocai.util; import net.sourceforge.tess4j.Tesseract; import net.sourceforge.tess4j.TesseractException; import org.apache.commons.io.IOUtils; +import org.apache.http.Header; +import org.apache.http.NameValuePair; +import org.apache.http.client.CookieStore; +import org.apache.http.client.entity.UrlEncodedFormEntity; +import org.apache.http.client.methods.*; +import org.apache.http.impl.client.BasicCookieStore; +import org.apache.http.impl.client.CloseableHttpClient; +import org.apache.http.impl.client.HttpClients; +import org.apache.http.message.BasicNameValuePair; +import org.apache.http.util.EntityUtils; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.core.io.Resource; import org.springframework.core.io.ResourceLoader; @@ -11,16 +21,25 @@ import org.springframework.stereotype.Service; import javax.imageio.ImageIO; import java.awt.image.BufferedImage; import java.io.ByteArrayInputStream; +import java.io.File; import java.io.IOException; import java.io.InputStream; +import java.util.ArrayList; +import java.util.List; + +import org.apache.http.client.methods.HttpGet; +import org.apache.http.client.methods.HttpPost; + +import org.apache.http.client.config.RequestConfig; +import org.apache.http.cookie.Cookie; @Service public class ImageOcrService { - private final Tesseract tesseract; private final ResourceLoader resourceLoader; - + private static CloseableHttpClient httpClient; + private static final String BASE_URL = "https://4701268539-esh.qdk63ayw8g.com"; @Autowired public ImageOcrService(Tesseract tesseract, ResourceLoader resourceLoader) { @@ -44,13 +63,220 @@ public class ImageOcrService { /** * 从远程 URL 获取图片并进行 OCR 处理 * - * @param imageUrl 图片 URL * @return OCR 结果文本 */ - public String ocrRemoteImage(String imageUrl) throws IOException, TesseractException { + public String ocrRemoteImage() throws IOException, TesseractException, InterruptedException { + int maxRetry = 5; - byte[] imageData = IOUtils.toByteArray(new ByteArrayInputStream(IOUtils.toByteArray(imageUrl))); - BufferedImage image = ImageIO.read(new ByteArrayInputStream(imageData)); - return tesseract.doOCR(image); + for (int attempt = 1; attempt <= maxRetry; attempt++) { + System.out.println("\n=== 第 " + attempt + " 次尝试 ==="); + + // 每次尝试都创建新的HttpClient和CookieStore + CookieStore cookieStore = new BasicCookieStore(); + CloseableHttpClient httpClient = HttpClients.custom() + .setDefaultCookieStore(cookieStore) + .build(); + + try { + // 1. 获取验证码 + System.out.println("获取验证码..."); + HttpGet getCaptcha = new HttpGet(BASE_URL + "/code"); + setCommonHeaders(getCaptcha); + // 添加Referer头 + getCaptcha.setHeader("Referer", BASE_URL + "/login"); + // 添加随机延迟,避免请求过快 + Thread.sleep(1000 + (long)(Math.random() * 1000)); + + CloseableHttpResponse captchaResponse = httpClient.execute(getCaptcha); + byte[] imageData = null; + + try { + int captchaStatus = captchaResponse.getStatusLine().getStatusCode(); + System.out.println("验证码响应状态码: " + captchaStatus); + + if (captchaStatus == 200) { + imageData = EntityUtils.toByteArray(captchaResponse.getEntity()); + } else if (captchaStatus == 429) { + System.out.println("获取验证码被限速,等待后重试..."); + Thread.sleep(3000); // 等待3秒 + continue; // 继续下一次尝试 + } else { + System.out.println("获取验证码失败: " + captchaStatus); + continue; + } + } finally { + captchaResponse.close(); + } + + // 2. OCR识别验证码 + String code = null; + if (imageData != null) { + BufferedImage image = ImageIO.read(new ByteArrayInputStream(imageData)); + code = tesseract.doOCR(image); + + // 清理验证码 + code = code.replaceAll("\\s+", "").trim(); + code = code.replaceAll("[^0-9]", ""); // 只保留数字 + + System.out.println("OCR原始结果: " + tesseract.doOCR(image)); + System.out.println("清理后验证码: [" + code + "] 长度: " + code.length()); + + // 保存图片用于调试 + File output = new File("captcha_attempt_" + attempt + ".png"); + ImageIO.write(image, "png", output); + System.out.println("验证码图片已保存到: " + output.getAbsolutePath()); + + if (code.length() != 4) { + System.out.println("验证码长度不是4位,跳过本次尝试"); + continue; + } + } else { + System.out.println("验证码数据为空"); + continue; + } + + // 等待一下再发送登录请求 + Thread.sleep(1500 + (long)(Math.random() * 1000)); + + // 3. 登录(不自动重定向) + System.out.println("执行登录..."); + HttpPost loginPost = new HttpPost(BASE_URL + "/login"); + setCommonHeaders(loginPost); + // 重要:添加Referer和Origin头 + loginPost.setHeader("Referer", BASE_URL + "/login"); + loginPost.setHeader("Origin", BASE_URL); + loginPost.setHeader("Accept", "application/json, text/plain, */*"); + // 构建登录参数 + List params = new ArrayList<>(); + params.add(new BasicNameValuePair("type", "1")); + params.add(new BasicNameValuePair("account", "pmk1")); + params.add(new BasicNameValuePair("password", "Asd123123")); + params.add(new BasicNameValuePair("code", code)); + loginPost.setEntity(new UrlEncodedFormEntity(params, "UTF-8")); + // 禁用自动重定向 + RequestConfig requestConfig = RequestConfig.custom() + .setRedirectsEnabled(false) + .build(); + loginPost.setConfig(requestConfig); + CloseableHttpResponse loginResponse = httpClient.execute(loginPost); + try { + int statusCode = loginResponse.getStatusLine().getStatusCode(); + System.out.println("登录响应状态码: " + statusCode); + + // 处理429错误 + if (statusCode == 429) { + System.out.println("登录请求被限速 (429 Too Many Requests)"); + + // 检查Retry-After头 + Header retryAfterHeader = loginResponse.getFirstHeader("Retry-After"); + if (retryAfterHeader != null) { + try { + int retryAfterSeconds = Integer.parseInt(retryAfterHeader.getValue()); + System.out.println("服务器要求等待 " + retryAfterSeconds + " 秒"); + Thread.sleep(retryAfterSeconds * 1000L); + } catch (NumberFormatException e) { + System.out.println("等待5秒后重试"); + Thread.sleep(5000); + } + } else { + System.out.println("等待3秒后重试"); + Thread.sleep(3000); + } + continue; // 继续下一次尝试 + } + // 打印响应头 + System.out.println("响应头:"); + for (Header header : loginResponse.getAllHeaders()) { + System.out.println(" " + header.getName() + ": " + header.getValue()); + } + // 检查是否是重定向 + if (statusCode == 302) { + Header locationHeader = loginResponse.getFirstHeader("Location"); + if (locationHeader != null) { + String location = locationHeader.getValue(); + System.out.println("重定向到: " + location); + + if (location.contains("e=3")) { + System.out.println("验证码错误 (e=3)"); + continue; // 继续下一次尝试 + } + } + } + // 读取响应体(如果有) + if (loginResponse.getEntity() != null) { + String responseBody = EntityUtils.toString(loginResponse.getEntity(), "UTF-8"); + if (responseBody != null && !responseBody.isEmpty()) { + System.out.println("响应体: " + responseBody); + + // 检查响应体中是否有token(JSON格式) + if (responseBody.contains("\"token\"")) { + // 简单提取token + int start = responseBody.indexOf("\"token\":\""); + if (start != -1) { + start += 9; + int end = responseBody.indexOf("\"", start); + if (end != -1) { + String token = responseBody.substring(start, end); + System.out.println("\n[SUCCESS] 从响应体找到Token!"); + System.out.println("Token: " + token); + return token; + } + } + } + } + // 消耗实体 + EntityUtils.consume(loginResponse.getEntity()); + } + // 4. 检查cookies中是否有token + String token = null; + List cookies = cookieStore.getCookies(); + System.out.println("所有cookies (" + cookies.size() + "个):"); + for (Cookie cookie : cookies) { + System.out.println(" " + cookie.getName() + " = " + cookie.getValue()); + + // 查找token + if ("token".equals(cookie.getName()) || + cookie.getName().toLowerCase().contains("token")) { + token = cookie.getValue(); + } + } + + if (token != null && !token.isEmpty()) { + System.out.println("\n[SUCCESS] Login OK!"); + System.out.println("Token: " + token); + return token; + } else if (statusCode == 200) { + // 如果是200状态码但没有token,可能是登录成功但token在其他地方 + System.out.println("登录返回200但没有找到token,可能需要检查其他认证方式"); + } + + } finally { + loginResponse.close(); + } + + } finally { + httpClient.close(); + } + // 如果不是最后一次尝试,等待一段时间 + if (attempt < maxRetry) { + System.out.println("\n等待2秒后进行下一次尝试..."); + Thread.sleep(2000); + } + } + + System.out.println("\n[FAILED] " + maxRetry + " 次尝试都失败了"); + return ""; + } + + private void setCommonHeaders(HttpRequestBase request) { + request.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"); + request.setHeader("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8"); + request.setHeader("Accept-Encoding", "gzip, deflate, br"); + request.setHeader("Connection", "keep-alive"); + request.setHeader("Upgrade-Insecure-Requests", "1"); + request.setHeader("Sec-Fetch-Dest", "document"); + request.setHeader("Sec-Fetch-Mode", "navigate"); + request.setHeader("Sec-Fetch-Site", "same-origin"); + request.setHeader("Sec-Fetch-User", "?1"); } } diff --git a/src/main/java/com/tem/bocai/util/ImageTest.java b/src/main/java/com/tem/bocai/util/ImageTest.java index 4285fa3..05f651f 100644 --- a/src/main/java/com/tem/bocai/util/ImageTest.java +++ b/src/main/java/com/tem/bocai/util/ImageTest.java @@ -9,7 +9,7 @@ import java.util.Map; public class ImageTest { public static void main(String[] args) throws Exception { - getImageStreamAndCookie("https://4701268539-esh.qdk63ayw8g.com/code?_=1768901529986"); + getImageStreamAndCookie("https://4701268539-esh.qdk63ayw8g.com/code"); } /** diff --git a/src/main/java/com/tem/bocai/util/LotteryDataPipeline.java b/src/main/java/com/tem/bocai/util/LotteryDataPipeline.java new file mode 100644 index 0000000..4a1e465 --- /dev/null +++ b/src/main/java/com/tem/bocai/util/LotteryDataPipeline.java @@ -0,0 +1,29 @@ +package com.tem.bocai.util; + + +import us.codecraft.webmagic.ResultItems; +import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.pipeline.Pipeline; + +import java.util.ArrayList; +import java.util.List; +import java.util.Map; + +public class LotteryDataPipeline implements Pipeline { + + private List> lotteryData = new ArrayList<>(); + + @Override + public void process(ResultItems resultItems, Task task) { + // 从ResultItems中获取数据 + List> data = resultItems.get("lotteryData"); + if (data != null && !data.isEmpty()) { + lotteryData.addAll(data); + System.out.println("Pipeline处理数据: " + data.size() + " 条"); + } + } + + public List> getLotteryData() { + return lotteryData; + } +} diff --git a/src/main/java/com/tem/bocai/util/LotteryWebMagicCrawler.java b/src/main/java/com/tem/bocai/util/LotteryWebMagicCrawler.java new file mode 100644 index 0000000..9c551ba --- /dev/null +++ b/src/main/java/com/tem/bocai/util/LotteryWebMagicCrawler.java @@ -0,0 +1,277 @@ +package com.tem.bocai.util; + +import com.fasterxml.jackson.databind.SerializationFeature; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.processor.PageProcessor; +import us.codecraft.webmagic.selector.Html; +import us.codecraft.webmagic.selector.Selectable; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import java.io.File; +import java.io.IOException; +import java.util.*; + +public class LotteryWebMagicCrawler implements PageProcessor { + + private final String token; + // 站点配置 + private Site site; + // final LoginService loginService; + + + public LotteryWebMagicCrawler(String token) { + this.token = token; + initSite(); + } + + /** + * 初始化Site配置 + */ + private void initSite() { + site = Site.me() + .setRetryTimes(3) + .setSleepTime(1000) + .setTimeOut(10000) + .setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36"); + + // 设置cookie + if (token != null && !token.isEmpty()) { + site.addHeader("cookie", "token=" + token); + } + } + + + @Override + public void process(Page page) { + // 获取页面HTML + Html html = page.getHtml(); + + // 打印页面基本信息 + System.out.println("页面URL: " + page.getUrl()); + System.out.println("页面标题: " + html.xpath("//title/text()").get()); + + // 示例:提取所有表格数据 + Selectable tables = html.xpath("//table"); + System.out.println("找到 " + tables.nodes().size() + " 个表格"); + + // 提取表格数据(根据实际页面结构调整选择器) + extractTableData(html); + + // 示例:提取所有链接 + Selectable links = html.links(); + System.out.println("页面包含 " + links.all().size() + " 个链接"); + + // 如果需要继续爬取其他页面 + // page.addTargetRequests(links.all()); + + // 将数据存入结果 + /* page.putField("html========", html.toString()); + page.putField("title", html.xpath("//title/text()").get());*/ + parseLotteryHtml(html.toString()); + + } + + private void extractTableData(Html html) { + // 根据实际页面结构编写数据提取逻辑 + // 示例:提取所有tr元素 + Selectable rows = html.xpath("//tr"); + for (Selectable row : rows.nodes()) { + // 提取每行的td内容 + String rowText = row.xpath("//td/text()").all().toString(); + if (!rowText.isEmpty()) { + System.out.println("行数据: " + rowText); + } + } + } + + @Override + public Site getSite() { + + return site; + } + + + /** + * 解析彩票HTML数据,转换成指定的List>格式 + * + * @param htmlContent 爬取到的HTML文本内容 + * @return 解析后的结构化数据列表 + */ + public static List> parseLotteryHtml(String htmlContent) { + List> resultList = new ArrayList<>(); + + // 初始化Jsoup解析器 + Document doc = Jsoup.parse(htmlContent); + + // 定位到数据所在的表格行(drawTable下的table > tbody > tr) + Element targetTable = doc.selectFirst("#drawTable"); + if (targetTable == null) { + return resultList; + } + + Elements trList = targetTable.select("table > tbody > tr"); + + // 遍历每一行数据 + for (Element tr : trList) { + Map rowData = new HashMap<>(); + + // 1. 提取期数(id) + Element periodTd = tr.selectFirst("td.period"); + rowData.put("id", periodTd != null ? periodTd.text().trim() : ""); + + // 2. 提取开奖时间(time) + Element timeTd = tr.selectFirst("td.drawTime"); + rowData.put("time", timeTd != null ? timeTd.text().trim() : ""); + + // 3. 提取开出号码(result)- 10个ballname的数字 + Elements ballTds = tr.select("td.ballname"); + List resultNumbers = new ArrayList<>(); + int count = 0; + for (Element td : ballTds) { + if (count >= 10) break; + String text = td.text().trim(); + if (text.matches("\\d+")) { + resultNumbers.add(Integer.parseInt(text)); + count++; + } + } + rowData.put("result", resultNumbers); + + // 4. 提取winner(other1) + Element winnerTd = tr.selectFirst("td.other1"); + if (winnerTd != null) { + String winnerText = winnerTd.text().trim(); + if (winnerText.matches("\\d+")) { + rowData.put("winner", Integer.parseInt(winnerText)); + } else { + rowData.put("winner", ""); + } + } else { + rowData.put("winner", ""); + } + + // 5. 提取GD1(冠亚小/大)、GD2(冠亚单/双) + Elements otherTds = tr.select("td.other"); + String gd1 = ""; + String gd2 = ""; + for (Element td : otherTds) { + String className = td.className(); + if (className.contains("GDX")) { + gd1 = td.text().trim(); + } else if (className.contains("GDS")) { + gd2 = td.text().trim(); + } + } + rowData.put("GD1", gd1); + rowData.put("GD2", gd2); + + // 6. 提取sum1(dldhl_sum)、sum2(dldhh_sum) + Element sum1Td = tr.selectFirst("td.dldhl_sum"); + if (sum1Td != null) { + String sum1Text = sum1Td.text().trim(); + if (sum1Text.matches("\\d+")) { + rowData.put("sum1", Integer.parseInt(sum1Text)); + } else { + rowData.put("sum1", ""); + } + } else { + rowData.put("sum1", ""); + } + + Element sum2Td = tr.selectFirst("td.dldhh_sum"); + if (sum2Td != null) { + String sum2Text = sum2Td.text().trim(); + if (sum2Text.matches("\\d+")) { + rowData.put("sum2", Integer.parseInt(sum2Text)); + } else { + rowData.put("sum2", ""); + } + } else { + rowData.put("sum2", ""); + } + + // 7. 提取GLH_result(龙虎结果,5个GLH开头的td) + List glhResults = new ArrayList<>(); + int glhCount = 0; + for (Element td : otherTds) { + if (glhCount >= 5) break; + String className = td.className(); + if (className.contains("GLH_")) { + glhResults.add(td.text().trim()); + glhCount++; + } + } + rowData.put("GLH_result", glhResults); + + // 将单行数据加入结果列表(只保留有期数的有效行) + if (!rowData.get("id").toString().isEmpty()) { + resultList.add(rowData); + } + } + writeToJsonFile(resultList); + System.out.println("打印结果===" + resultList); + return resultList; + } + + public static void writeToJsonFile(List> resultList) { + try { + // 创建 ObjectMapper 实例 + ObjectMapper objectMapper = new ObjectMapper(); + + // 设置 JSON 格式化(可选,更易读) + objectMapper.enable(SerializationFeature.INDENT_OUTPUT); + + // 定义输出目录和文件名 + String directoryPath = "output/json"; // 项目根目录下的 output/json 文件夹 + String fileName = "result_" + System.currentTimeMillis() + ".json"; + String filePath = directoryPath + "/" + fileName; + + // 创建目录(如果不存在) + File directory = new File(directoryPath); + if (!directory.exists()) { + directory.mkdirs(); // 创建多级目录 + } + // 创建文件对象 + File outputFile = new File(filePath); + // 将 List 写入 JSON 文件 + objectMapper.writeValue(outputFile, resultList); + System.out.println("数据已成功写入文件: " + outputFile.getAbsolutePath()); + + } catch (IOException e) { + e.printStackTrace(); + throw new RuntimeException("写入 JSON 文件失败: " + e.getMessage(), e); + } + } + + + public static void main(String[] args) { + String url = "https://4701268539-esh.qdk63ayw8g.com/member/dresult?lottery=SGFT&date=2026-01-18"; + + // 创建爬虫 + Spider.create(new LotteryWebMagicCrawler("")) + .addUrl(url) // 添加起始URL + .thread(1) // 线程数 + .run(); // 开始爬取 + } + + + // 自定义headers + /*private Map getHeaders() { + Map headers = new HashMap<>(); + headers.put("cookie", "token=a1b219fe7e39374d6af532c56fdc911b76ae8f83"); + + return headers; + }*/ +} \ No newline at end of file diff --git a/src/main/java/com/tem/bocai/util/TokenCacheManager.java b/src/main/java/com/tem/bocai/util/TokenCacheManager.java new file mode 100644 index 0000000..faeed4f --- /dev/null +++ b/src/main/java/com/tem/bocai/util/TokenCacheManager.java @@ -0,0 +1,217 @@ +package com.tem.bocai.util; + +import com.fasterxml.jackson.databind.ObjectMapper; +import org.springframework.stereotype.Component; +import org.springframework.util.StringUtils; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.util.Date; +import java.util.HashMap; +import java.util.Map; + +@Component +public class TokenCacheManager { + + private static final String CACHE_FILE_PATH = "token_cache.json"; + private static final long TOKEN_EXPIRE_TIME = 1140000; // 19分钟过期(毫秒) 1140000 = 19 * 60 * 1000 + + private final ObjectMapper objectMapper = new ObjectMapper(); + + /** + * 保存token到缓存文件 + */ + public void saveToken(String token) { + try { + Map cacheData = new HashMap<>(); + cacheData.put("token", token); + cacheData.put("timestamp", System.currentTimeMillis()); + cacheData.put("expireTime", TOKEN_EXPIRE_TIME); + cacheData.put("expireMinutes", 19); // 记录过期分钟数 + + String json = objectMapper.writeValueAsString(cacheData); + Files.write(Paths.get(CACHE_FILE_PATH), json.getBytes()); + + System.out.println("Token已保存到缓存文件: " + CACHE_FILE_PATH); + System.out.println("Token将在19分钟后过期"); + + } catch (IOException e) { + System.err.println("保存token缓存失败: " + e.getMessage()); + } + } + + /** + * 从缓存文件读取token + */ + public String readToken() { + try { + File cacheFile = new File(CACHE_FILE_PATH); + if (!cacheFile.exists()) { + System.out.println("缓存文件不存在"); + return null; + } + + String json = new String(Files.readAllBytes(Paths.get(CACHE_FILE_PATH))); + Map cacheData = objectMapper.readValue(json, Map.class); + + String token = (String) cacheData.get("token"); + Long timestamp = (Long) cacheData.get("timestamp"); + + if (!StringUtils.hasText(token) || timestamp == null) { + System.out.println("缓存数据不完整"); + return null; + } + + // 检查token是否过期 + long currentTime = System.currentTimeMillis(); + long elapsedTime = currentTime - timestamp; + + if (elapsedTime > TOKEN_EXPIRE_TIME) { + long expiredSeconds = (elapsedTime - TOKEN_EXPIRE_TIME) / 1000; + System.out.println("Token已过期 " + expiredSeconds + " 秒"); + return null; + } + + // 计算剩余时间 + long remainingTime = TOKEN_EXPIRE_TIME - elapsedTime; + long remainingMinutes = remainingTime / 60000; + long remainingSeconds = (remainingTime % 60000) / 1000; + + System.out.println("从缓存读取token,剩余有效时间: " + + remainingMinutes + "分" + remainingSeconds + "秒"); + return token; + + } catch (IOException e) { + System.err.println("读取token缓存失败: " + e.getMessage()); + return null; + } + } + + /** + * 清除token缓存 + */ + public void clearToken() { + try { + File cacheFile = new File(CACHE_FILE_PATH); + if (cacheFile.exists()) { + Files.delete(Paths.get(CACHE_FILE_PATH)); + System.out.println("Token缓存已清除"); + } + } catch (IOException e) { + System.err.println("清除token缓存失败: " + e.getMessage()); + } + } + + /** + * 检查token是否存在且有效 + */ + public boolean hasValidToken() { + return readToken() != null; + } + + /** + * 检查token是否即将过期(例如5分钟内过期) + */ + public boolean isTokenExpiringSoon(int warningMinutes) { + try { + File cacheFile = new File(CACHE_FILE_PATH); + if (!cacheFile.exists()) { + return false; + } + + String json = new String(Files.readAllBytes(Paths.get(CACHE_FILE_PATH))); + Map cacheData = objectMapper.readValue(json, Map.class); + + Long timestamp = (Long) cacheData.get("timestamp"); + if (timestamp == null) { + return false; + } + + long currentTime = System.currentTimeMillis(); + long elapsedTime = currentTime - timestamp; + long remainingTime = TOKEN_EXPIRE_TIME - elapsedTime; + + // 检查是否在指定分钟内过期 + return remainingTime > 0 && remainingTime <= (warningMinutes * 60000); + + } catch (Exception e) { + return false; + } + } + + /** + * 获取token信息(用于调试) + */ + public Map getTokenInfo() { + try { + File cacheFile = new File(CACHE_FILE_PATH); + if (!cacheFile.exists()) { + return null; + } + + String json = new String(Files.readAllBytes(Paths.get(CACHE_FILE_PATH))); + Map cacheData = objectMapper.readValue(json, Map.class); + + Long timestamp = (Long) cacheData.get("timestamp"); + if (timestamp != null) { + long currentTime = System.currentTimeMillis(); + long elapsedTime = currentTime - timestamp; + long remainingTime = TOKEN_EXPIRE_TIME - elapsedTime; + + cacheData.put("savedTime", new Date(timestamp).toString()); + cacheData.put("elapsedTime", formatTime(elapsedTime)); + cacheData.put("remainingTime", formatTime(remainingTime)); + cacheData.put("isValid", remainingTime > 0); + cacheData.put("expireMinutes", 19); + + // 添加过期警告 + if (remainingTime > 0) { + if (remainingTime <= 300000) { // 5分钟内过期 + cacheData.put("warning", "Token将在5分钟内过期,建议刷新"); + } else if (remainingTime <= 600000) { // 10分钟内过期 + cacheData.put("warning", "Token将在10分钟内过期"); + } + } + } + + return cacheData; + + } catch (Exception e) { + return null; + } + } + + /** + * 格式化时间显示 + */ + private String formatTime(long millis) { + if (millis <= 0) { + return "已过期"; + } + + long minutes = millis / 60000; + long seconds = (millis % 60000) / 1000; + + if (minutes > 0) { + return minutes + "分" + seconds + "秒"; + } else { + return seconds + "秒"; + } + } + + /** + * 获取token过期时间配置 + */ + public static long getTokenExpireTime() { + return TOKEN_EXPIRE_TIME; + } + + /** + * 获取token过期分钟数 + */ + public static int getTokenExpireMinutes() { + return (int) (TOKEN_EXPIRE_TIME / 60000); + } +} \ No newline at end of file