调整脚本

2026-01-23 17:38:42 +08:00
parent 78fb532fa2
commit 64ba2b1ead
4 changed files with 2862 additions and 9 deletions
--- a/PyModel/aggregate_data_v7.py
+++ b/PyModel/aggregate_data_v7.py
@@ -0,0 +1,94 @@
+import json
+import os
+import pandas as pd
+import numpy as np
+from datetime import timedelta
+
+# 1. Load and merge the raw records.
+# Alternative input set: data_dirs = ['data_test_dir', 'data_test_dir1']
+data_dirs = ['data_test_dir']
+all_records = []
+
+for data_dir in data_dirs:
+    # A missing (or non-directory) entry is skipped instead of aborting the run.
+    if not os.path.isdir(data_dir):
+        continue
+    # os.listdir() returns names in arbitrary order; sorting makes the
+    # keep='last' de-duplication below reproducible between runs.
+    for name in sorted(os.listdir(data_dir)):
+        if not name.endswith('.json') or name == 'stat_result.json':
+            continue
+        path = os.path.join(data_dir, name)
+        with open(path, 'r', encoding='utf-8') as fh:
+            try:
+                records = json.load(fh)
+            except ValueError as exc:
+                # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
+                # Loading stays best-effort, but the dropped file is reported.
+                print(f"Skipping unreadable JSON file {path}: {exc}")
+                continue
+        if not isinstance(records, list):
+            # extend() on a dict would add its keys as bogus "records".
+            print(f"Skipping {path}: expected a JSON array of records")
+            continue
+        all_records.extend(records)
+
+if not all_records:
+    # Without any record the frame has no 'id' column and the
+    # de-duplication below would fail with an opaque KeyError.
+    raise SystemExit(f"No records found in {data_dirs}; nothing to aggregate.")
+
+df = pd.DataFrame(all_records)
+# When the same id appears more than once, the last loaded copy wins.
+df = df.drop_duplicates(subset=['id'], keep='last')
+df['time'] = pd.to_datetime(df['time'])
+print(f"Total unique records: {len(df)}")
+
+
+# 2. Fast per-group statistics
+def calculate_stats_fast(group):
+    """Build normalized frequency tables for one group of records.
+
+    Reads the columns ``winner``, ``GD2``, ``result`` and ``GLH_result``.
+    Each ``result`` entry is expanded into columns and positions 0-9 are
+    read; each ``GLH_result`` entry is expanded and positions 0-4 are read.
+
+    Returns an empty dict for an empty group, otherwise a dict with the keys
+    ``winner_prob``, ``GD1_prob``, ``GD2_prob``, ``result_pos_prob``,
+    ``result_pos_detail_prob`` and ``GLH_pos_prob``. Every leaf maps a
+    distinct value (or label) to its relative frequency within the group.
+    """
+    if group.empty:
+        return {}
+
+    def freq(series):
+        # Relative frequency of each distinct value, as a plain dict.
+        return series.value_counts(normalize=True).to_dict()
+
+    winner = group['winner']
+    # Label each record by whether its winner value reaches 12.
+    winner_size = (winner >= 12).map({True: '冠亚大', False: '冠亚小'})
+    stats = {
+        'winner_prob': freq(winner),
+        'GD1_prob': freq(winner_size),
+        'GD2_prob': freq(group['GD2']),
+    }
+
+    # One column per position of the 'result' sequences.
+    positions = pd.DataFrame(group['result'].tolist())
+    per_value = {}
+    per_trait = {}
+    for idx in range(10):
+        key = f'pos_{idx}'
+        values = positions[idx]
+        per_value[key] = freq(values)
+        # Values of 6 and above are labelled big; odd values are labelled odd.
+        size_label = (values >= 6).map({True: '大', False: '小'})
+        parity_label = (values % 2 != 0).map({True: '单', False: '双'})
+        per_trait[key] = {
+            'big_small': freq(size_label),
+            'odd_even': freq(parity_label),
+        }
+    stats['result_pos_prob'] = per_value
+    stats['result_pos_detail_prob'] = per_trait
+
+    # One column per position of the 'GLH_result' sequences.
+    glh = pd.DataFrame(group['GLH_result'].tolist())
+    stats['GLH_pos_prob'] = {f'pos_{idx}': freq(glh[idx]) for idx in range(5)}
+    return stats
+
+
+# 3. Aggregate along several time dimensions
+df['hour_min'] = df['time'].dt.strftime('%H:%M:%S')
+df['day_of_month'] = df['time'].dt.day
+df['day_of_week'] = df['time'].dt.dayofweek
+
+# Full-history statistics, grouped by time of day, day of month and weekday.
+print("Calculating full history stats...")
+time_stats = df.groupby('hour_min').apply(calculate_stats_fast).to_dict()
+date_stats = df.groupby('day_of_month').apply(calculate_stats_fast).to_dict()
+week_stats = df.groupby('day_of_week').apply(calculate_stats_fast).to_dict()
+
+# Recent-window statistics.
+last_date = df['time'].max()
+# Window length in days: one ten-thousandth of len(df) / 276, plus one day.
+# NOTE(review): 276 is presumably the number of records per day -- confirm.
+# Author's note: the extra days pulled into the window shift the resulting
+# probability distribution. An earlier variant sized the window as
+# int(len(df) * 0.3241 / 276) days, 0.3241 being the approximate overall
+# prediction hit rate.
+recent_window_days = (0.0001 * len(df["time"]) / 276) + 1
+# Report the real window; it is not a fixed 100 days.
+print(f"Calculating recent {recent_window_days:.2f} days stats...")
+start_date_last_0000_2d = last_date - timedelta(days=recent_window_days)
+df_0000_2d = df[df['time'] >= start_date_last_0000_2d]
+time_stats_0000_2d = df_0000_2d.groupby('hour_min').apply(calculate_stats_fast).to_dict()
+
+# 4. Save the result
+output_data = {
+    'by_time': time_stats,
+    'by_time_recent_0000_2d': time_stats_0000_2d,
+    'by_date': date_stats,
+    'by_week': week_stats,
+    'last_updated': last_date.strftime('%Y-%m-%d %H:%M:%S')
+}
+
+output_path = 'data_test_predict/aggregated_stats_v7.json'
+# open(..., 'w') does not create missing parent directories.
+os.makedirs(os.path.dirname(output_path), exist_ok=True)
+with open(output_path, 'w', encoding='utf-8') as f:
+    json.dump(output_data, f)
+
+print(f"Stats V7 generated with {recent_window_days:.2f}-day recent window. Last data point: {last_date}")