import matplotlib.pyplot as plt
from openpyxl import Workbook
from openpyxl.drawing.image import Image
from openpyxl.styles import Font
from openai import OpenAI
matplotlib.rcParams['font.sans-serif'] = ['SimHei']
matplotlib.rcParams['axes.unicode_minus'] = False
logging.basicConfig(filename='data_processing.log',
format='%(asctime)s - %(levelname)s - %(message)s')
file_path = r'C:\\Users\\szqsh\\PycharmProjects\\PythonProject\\Motor_Vehicle.csv'
output_excel_path = r'C:\\Users\\szqsh\\PycharmProjects\\PythonProject\\Motor_Vehicle_Report.xlsx'
'CROSS STREET NAME': str,
'NUMBER OF PERSONS INJURED': pd.Int64Dtype(),
'NUMBER OF PERSONS KILLED': pd.Int64Dtype(),
'NUMBER OF PEDESTRIANS INJURED': pd.Int64Dtype(),
'NUMBER OF PEDESTRIANS KILLED': pd.Int64Dtype(),
'NUMBER OF CYCLIST INJURED': pd.Int64Dtype(),
'NUMBER OF CYCLIST KILLED': pd.Int64Dtype(),
'NUMBER OF MOTORIST INJURED': pd.Int64Dtype(),
'NUMBER OF MOTORIST KILLED': pd.Int64Dtype(),
'CONTRIBUTING FACTOR VEHICLE 1': str,
'CONTRIBUTING FACTOR VEHICLE 2': str,
'CONTRIBUTING FACTOR VEHICLE 3': str,
'CONTRIBUTING FACTOR VEHICLE 4': str,
'CONTRIBUTING FACTOR VEHICLE 5': str,
'VEHICLE TYPE CODE 1': str,
'VEHICLE TYPE CODE 2': str,
'VEHICLE TYPE CODE 3': str,
'VEHICLE TYPE CODE 4': str,
'VEHICLE TYPE CODE 5': str
chunks = pd.read_csv(file_path, dtype=dtype_dict, chunksize=chunk_size, low_memory=False)
ws2 = wb.create_sheet("分析报告")
borough_counts = pd.Series(dtype=int)
time_counts = pd.Series(dtype=int)
injured_killed = pd.Series([0, 0], index=['受伤人数', '死亡人数'])
factor_counts = pd.Series(dtype=int)
for chunk in tqdm(chunks, desc="数据处理进度", unit="块"):
chunk['NUMBER OF PERSONS INJURED'] = chunk['NUMBER OF PERSONS INJURED'].fillna(0).astype(int)
chunk['NUMBER OF PERSONS KILLED'] = chunk['NUMBER OF PERSONS KILLED'].fillna(0).astype(int)
borough_counts.update(chunk['BOROUGH'].value_counts())
chunk['CRASH TIME'] = pd.to_datetime(chunk['CRASH TIME'], format='%H:%M', errors='coerce').dt.hour
time_counts.update(chunk['CRASH TIME'].value_counts())
injured_killed['受伤人数'] += chunk['NUMBER OF PERSONS INJURED'].sum()
injured_killed['死亡人数'] += chunk['NUMBER OF PERSONS KILLED'].sum()
factor_counts.update(chunk['CONTRIBUTING FACTOR VEHICLE 1'].value_counts())
total_rows += len(chunk)
logging.error(f"数据处理错误: {str(e)}")
raise ValueError("输入数据为空")
def create_basic_charts():
title_font = Font(bold=True, size=14)
ws2['A1'].font = title_font
plt.figure(figsize=(10, 6))
borough_counts.sort_values().plot(kind='barh', color='#1f77b4')
plt.title('行政区事故分布', fontsize=14)
plt.savefig(img_buffer, format='png', bbox_inches='tight')
ws2.add_image(Image(img_buffer), 'A3')
plt.figure(figsize=(10, 6))
time_counts.sort_index().plot(kind='area', color='#ff7f0e', alpha=0.5)
plt.title('小时事故分布', fontsize=14)
plt.savefig(img_buffer, format='png')
ws2.add_image(Image(img_buffer), 'A30')
logging.error(f"基础图表生成失败: {str(e)}")
api_key="sk-ozrrgkixmjilrjrtsinrgobzyhcyqclqpomxqidotenkivzk",
base_url="<https://api.siliconflow.cn/v1>"
def generate_prediction():
summary = f"""历史数据摘要(2020-2023):
{borough_counts.nlargest(5).to_string()}
{time_counts.nlargest(3).to_string()}
受伤人数: {injured_killed['受伤人数']:,}
死亡人数: {injured_killed['死亡人数']:,}
{factor_counts.nlargest(5).to_string()}"""
response = client.chat.completions.create(
model='deepseek-ai/DeepSeek-V2.5',
"content": f"{summary}\\n\\n请预测2025年美国各行政区摩托车事故情况,包含:"
"1. 各行政区事故数预测\\n2. 伤亡人数预测\\n3. 主要原因分析\\n4. 安全建议"
ws_pred = wb.create_sheet("AI预测")
ws_pred.append(["预测报告生成时间", pd.Timestamp.now().strftime("%Y-%m-%d %H:%M")])
for line in response.choices[0].message.content.split('\\n'):
ws_pred.append([line.strip()])
logging.error(f"预测失败: {str(e)}")
ws_pred = wb.create_sheet("AI预测")
ws_pred.append(["预测报告生成失败", str(e)])
def create_advanced_visuals():
ws_visual = wb.create_sheet("可视化报告")
plt.style.use('seaborn-v0_8') # 修正样式引用
plt.figure(figsize=(12, 6))
'BROOKLYN': [490000, 70000, 350],
'QUEENS': [410000, 60000, 300],
'MANHATTAN': [340000, 50000, 250]
df = pd.DataFrame(data, index=['事故数', '受伤人数', '死亡人数']).T
ax = df.plot(kind='bar', color=['#2ca02c', '#d62728', '#9467bd'],
edgecolor='black', linewidth=0.8)
if v > 10000: return f"{v / 1000:.0f}k"
ax.annotate(format_label(p.get_height()),
(p.get_x() + p.get_width() / 2., p.get_height()),
ha='center', va='center',
xytext=(0, 5), textcoords='offset points',
plt.title('2025事故预测', fontsize=14, pad=15)
plt.savefig(img_buffer, dpi=120)
ws_visual.add_image(Image(img_buffer), 'A2')
plt.figure(figsize=(12, 6))
plt.plot(hours, [i * 1500 for i in [0.8, 1.2, 1.5, 1.8, 2.0, 2.1, 2.0, 1.8,
1.6, 1.5, 1.6, 1.8, 2.0, 2.2, 2.5, 2.7,
2.8, 2.7, 2.5, 2.3, 2.0, 1.7, 1.3, 0.9]],
color='#e377c2', marker='o', linestyle='--')
plt.fill_between(hours, 0, alpha=0.2)
plt.grid(True, alpha=0.3)
plt.title('小时事故趋势预测', fontsize=14)
ws_visual.add_image(Image(BytesIO()), 'A25')
logging.error(f"可视化生成失败: {str(e)}")
create_advanced_visuals()
wb.save(output_excel_path)
print(f"报告已生成: {output_excel_path}")
logging.error(f"文件保存失败: {str(e)}")