调整脚本
This commit is contained in:
8
.gitignore
vendored
8
.gitignore
vendored
@@ -34,13 +34,7 @@ out/
|
|||||||
.copilot/
|
.copilot/
|
||||||
|
|
||||||
# Maven 构建目录
|
# Maven 构建目录
|
||||||
/ruoyi-common/target/
|
|
||||||
/ruoyi-system/target/
|
|
||||||
/ruoyi-quartz/target/
|
|
||||||
/ruoyi-generator/target/
|
|
||||||
/ruoyi-framework/target/
|
|
||||||
/ruoyi-admin/target/
|
|
||||||
/erp_client_sb/target/
|
|
||||||
target/
|
target/
|
||||||
*.class
|
*.class
|
||||||
|
|
||||||
|
|||||||
94
PyModel/aggregate_data_v7.py
Normal file
94
PyModel/aggregate_data_v7.py
Normal file
@@ -0,0 +1,94 @@
|
|||||||
|
import json
|
||||||
|
import os
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
from datetime import timedelta
|
||||||
|
|
||||||
|
# 1. Load and merge the raw records.
# Every *.json file (except 'stat_result.json') in each data directory is
# expected to contain a JSON array of record objects; all arrays are
# concatenated into `all_records`.
# data_dirs = ['data_test_dir', 'data_test_dir1']
data_dirs = ['data_test_dir']
all_records = []

for d in data_dirs:
    if not os.path.exists(d):
        continue
    # os.listdir() returns entries in arbitrary order. Sorting makes the load
    # order deterministic, and therefore also which duplicate the
    # `keep='last'` de-duplication below retains.
    for f in sorted(os.listdir(d)):
        if not f.endswith('.json') or f == 'stat_result.json':
            continue
        path = os.path.join(d, f)
        with open(path, 'r') as file:
            try:
                records = json.load(file)
            except ValueError as exc:
                # Covers json.JSONDecodeError and UnicodeDecodeError: a corrupt
                # file is skipped (best effort), but no longer silently.
                print(f"Skipping unparsable file {path}: {exc}")
                continue
        if not isinstance(records, list):
            # Extending with a dict would add its keys as bogus "records".
            print(f"Skipping {path}: expected a JSON array of records")
            continue
        all_records.extend(records)

if not all_records:
    # Without any record the frame has no 'id'/'time' columns and the
    # statements below would fail with an opaque KeyError.
    raise SystemExit(f"No records found in {data_dirs}; nothing to aggregate.")

df = pd.DataFrame(all_records)
# The same record id may appear in several files; keep the last one loaded.
df = df.drop_duplicates(subset=['id'], keep='last')
df['time'] = pd.to_datetime(df['time'])
print(f"Total unique records: {len(df)}")
|
||||||
|
|
||||||
|
|
||||||
|
# 2. 极速统计函数
|
||||||
|
def calculate_stats_fast(group):
    """Build relative-frequency tables for one group of records.

    Args:
        group: DataFrame with the columns 'winner', 'GD2', 'result' and
            'GLH_result'. Each 'result' cell is indexed at positions 0-9 and
            each 'GLH_result' cell at positions 0-4.

    Returns:
        An empty dict when ``group`` is empty; otherwise a dict with the keys
        'winner_prob', 'GD1_prob', 'GD2_prob', 'result_pos_prob',
        'result_pos_detail_prob' and 'GLH_pos_prob'. Every leaf is a
        ``{value: relative_frequency}`` mapping.
    """
    if group.empty:
        return {}

    def _freq(series):
        # Share of each distinct value in the series, as a plain dict.
        return series.value_counts(normalize=True).to_dict()

    winner_labels = {True: '冠亚大', False: '冠亚小'}
    stats = {
        'winner_prob': _freq(group['winner']),
        # 'winner' values of 12 and above are labelled big, the rest small.
        'GD1_prob': _freq((group['winner'] >= 12).map(winner_labels)),
        'GD2_prob': _freq(group['GD2']),
    }

    # Expand the per-record result sequences into one column per position.
    results = pd.DataFrame(group['result'].tolist())
    pos_probs = {}
    pos_detail_probs = {}
    for pos in range(10):
        key = f'pos_{pos}'
        values = results[pos]
        pos_probs[key] = _freq(values)
        # Values of 6 and above count as big; odd/even is decided by parity.
        size_labels = (values >= 6).map({True: '大', False: '小'})
        parity_labels = (values % 2 != 0).map({True: '单', False: '双'})
        pos_detail_probs[key] = {
            'big_small': _freq(size_labels),
            'odd_even': _freq(parity_labels),
        }
    stats['result_pos_prob'] = pos_probs
    stats['result_pos_detail_prob'] = pos_detail_probs

    # Same per-position breakdown for the five GLH_result entries.
    glh = pd.DataFrame(group['GLH_result'].tolist())
    stats['GLH_pos_prob'] = {f'pos_{pos}': _freq(glh[pos]) for pos in range(5)}
    return stats
|
||||||
|
|
||||||
|
|
||||||
|
# 3. Multi-dimensional aggregation.
# Derive the grouping keys from the record timestamp: time of day,
# day of month, and day of week (0 = Monday per pandas `dt.dayofweek`).
df['hour_min'] = df['time'].dt.strftime('%H:%M:%S')
df['day_of_month'] = df['time'].dt.day
df['day_of_week'] = df['time'].dt.dayofweek

# Full-history statistics.
print("Calculating full history stats...")
time_stats = df.groupby('hour_min').apply(calculate_stats_fast).to_dict()
date_stats = df.groupby('day_of_month').apply(calculate_stats_fast).to_dict()
week_stats = df.groupby('day_of_week').apply(calculate_stats_fast).to_dict()

# Recent-window statistics.
# NOTE(review): the log messages below still say "100 days", but the window
# actually used is the one computed for `start_date_last_0000_2d`:
# 1 day plus 0.0001 * record_count / 276 days before the newest record.
print("Calculating recent 100 days stats...")
last_date = df['time'].max()
# Overall prediction hit rate is roughly 0.3241.
# start_date_last_0000_1d = last_date - timedelta(days=int(len(df["time"]) * 0.3241 / 276))
# Take the preceding x days (these extra x days affect the probability
# distribution) plus one ten-thousandth of the history.
# NOTE(review): 276 is presumably the number of records per day - confirm.
start_date_last_0000_2d = last_date - timedelta(days=(0.0001 * len(df["time"]) / 276) + 1)
df_0000_2d = df[df['time'] >= start_date_last_0000_2d]
time_stats_0000_2d = df_0000_2d.groupby('hour_min').apply(calculate_stats_fast).to_dict()

# 4. Save the result.
output_data = {
    'by_time': time_stats,
    'by_time_recent_0000_2d': time_stats_0000_2d,
    'by_date': date_stats,
    'by_week': week_stats,
    'last_updated': last_date.strftime('%Y-%m-%d %H:%M:%S')
}

output_path = 'data_test_predict/aggregated_stats_v7.json'
# Create the target directory on first run so open() cannot fail with
# FileNotFoundError.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, 'w') as f:
    json.dump(output_data, f)

print(f"Stats V7 generated with 100-day window. Last data point: {last_date}")
|
||||||
2761
PyModel/data_test_dir/result_1769062158609.json
Normal file
2761
PyModel/data_test_dir/result_1769062158609.json
Normal file
File diff suppressed because it is too large
Load Diff
@@ -72,8 +72,12 @@ public class ExAggregateDataScriptSchedule {
|
|||||||
|
|
||||||
// 创建ProcessBuilder并设置工作目录为PyModel
|
// 创建ProcessBuilder并设置工作目录为PyModel
|
||||||
ProcessBuilder pb = new ProcessBuilder(command);
|
ProcessBuilder pb = new ProcessBuilder(command);
|
||||||
// 设置工作目录为PyModel
|
// 获取项目根目录的绝对路径
|
||||||
pb.directory(new java.io.File("PyModel"));
|
String projectRoot = System.getProperty("user.dir");
|
||||||
|
// 设置工作目录为PyModel的绝对路径
|
||||||
|
java.io.File pyModelDir = new java.io.File(projectRoot, "PyModel");
|
||||||
|
pb.directory(pyModelDir);
|
||||||
|
System.out.println("执行目录: " + pyModelDir.getAbsolutePath());
|
||||||
|
|
||||||
// 执行Python脚本
|
// 执行Python脚本
|
||||||
Process process = pb.start();
|
Process process = pb.start();
|
||||||
|
|||||||
Reference in New Issue
Block a user