调整脚本

This commit is contained in:
2026-01-23 18:04:11 +08:00
parent 5eac38c0c1
commit c3597e57f6
4 changed files with 56990 additions and 77 deletions

View File

@@ -10,19 +10,43 @@ data_dirs = ['data_test_dir']
# --- Load raw JSON records from the configured data directories ---
# Walks each directory, reads every *.json file except the derived
# stat_result.json, and accumulates all records into one flat list.
all_records = []
for d in data_dirs:
    print(f"Processing directory: {d}")
    if not os.path.exists(d):
        print(f"Directory {d} does not exist")
        continue
    print(f"Found directory: {d}")
    files = os.listdir(d)
    print(f"Files in directory: {files}")
    for f in files:
        if f.endswith('.json') and f != 'stat_result.json':
            file_path = os.path.join(d, f)
            print(f"Processing file: {file_path}")
            try:
                with open(file_path, 'r', encoding='utf-8') as file:
                    # assumes each file holds a JSON array of record dicts
                    # — TODO confirm against the producer of these files
                    data = json.load(file)
                print(f"Loaded {len(data)} records from {f}")
                if len(data) > 0:
                    print(f"First record keys: {list(data[0].keys())}")
                all_records.extend(data)
            except Exception as e:
                # Best-effort load: log and skip unreadable/malformed files.
                print(f"Error processing {file_path}: {str(e)}")
                continue
print(f"Total records loaded: {len(all_records)}")

# --- Build the DataFrame and normalize it ---
df = pd.DataFrame(all_records)
print(f"Total unique records: {len(df)}")
print(f"DataFrame columns: {list(df.columns)}")
if 'id' in df.columns:
    # keep='last' so the most recently loaded copy of a record wins
    df = df.drop_duplicates(subset=['id'], keep='last')
    print(f"Total unique records after deduplication: {len(df)}")
else:
    print("No 'id' column found in DataFrame")
if 'time' in df.columns:
    df['time'] = pd.to_datetime(df['time'])
    print(f"Total unique records: {len(df)}")
else:
    print("No 'time' column found in DataFrame")
print("Sample of first 5 records:")
print(df.head())
# 2. 极速统计函数
@@ -88,7 +112,18 @@ output_data = {
'last_updated': last_date.strftime('%Y-%m-%d %H:%M:%S')
}
# --- Persist the aggregated stats ---
# Create the output directory if it does not exist yet.  exist_ok=True
# avoids the race between the exists() check and makedirs() if another
# process creates the directory in between; the check is kept only for
# the log message.
predict_dir = 'data_test_predict'
if not os.path.exists(predict_dir):
    print(f"Creating directory: {predict_dir}")
    os.makedirs(predict_dir, exist_ok=True)
else:
    print(f"Directory {predict_dir} already exists")

# Save the aggregated result for the downstream prediction step.
output_file = os.path.join(predict_dir, 'aggregated_stats_v7.json')
print(f"Saving results to: {output_file}")
with open(output_file, 'w') as f:
    json.dump(output_data, f)
print(f"Stats V7 generated with 100-day window. Last data point: {last_date}")

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff