survey/generate/merge_questions.py
朱潮 99796408cf Initial commit: Add survey system with enhanced features
- Complete survey management system with web interface
- Question generation tools and prompts
- Report generation and analysis capabilities
- Docker configuration for deployment
- Database initialization scripts

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-28 20:28:57 +08:00

277 lines
10 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import re
import shutil
from openpyxl import load_workbook
def parse_question_block(lines, question_type, question_num, score, file_title):
    """Parse the text lines of one question into a spreadsheet row dict.

    Recognised lines (labels accept an ASCII ``:`` or a full-width ``:``):

    * ``题目:<stem>``                  -> 题干 (only the first such line is used)
    * ``A.`` .. ``H.`` + text          -> 选项 A .. 选项 H
    * ``正确答案:<answer>``            -> 答案
    * ``解析:<explanation>``           -> 解析
    * a line containing 知识点标签 / 能力标签 -> appended to 标签

    Args:
        lines: Stripped, non-empty lines of a single question, with the
            leading question marker (e.g. ``A1.``) already removed.
        question_type: ``'A'``, ``'B'`` or ``'C'``, mapped to the type tag
            基础题 / 进阶题 / 竞赛题. Any other value gives an empty type tag.
        question_num: Value stored under 序号.
        score: Value stored under 分数.
        file_title: Title of the source file; used as the second tag.

    Returns:
        A dict with the keys 序号, 题干, 选项 A..选项 H, 解析, 分数, 答案,
        标签 and ``''`` (the empty 15th column).
    """
    colons = (':', ':')
    option_pattern = re.compile(r'([A-H])\.')
    next_question_pattern = re.compile(r'[ABC]\d+\.')
    tag_labels = ('知识点标签', '能力标签')

    def labelled_value(text, label):
        """Return the text after a leading ``label`` + colon, else None."""
        found = re.match(re.escape(label) + r'\s*[::]\s*', text)
        return text[found.end():].strip() if found else None

    def remove_label(text, label):
        """Remove the first ``label`` and the colon directly after it."""
        head, _, tail = text.partition(label)
        tail = tail.lstrip()
        if tail.startswith(colons):
            tail = tail[1:]
        return (head + tail).strip()

    type_tag = {'A': '基础题', 'B': '进阶题', 'C': '竞赛题'}.get(question_type, '')

    question_data = {'序号': question_num, '题干': ''}
    for letter in 'ABCDEFGH':
        question_data[f'选项 {letter}'] = ''
    question_data.update({
        '解析': '',
        '分数': score,
        '答案': '',
        '标签': type_tag,  # replaced by "type title tag" when a tag line exists
        '': '',            # 15th column is intentionally empty
    })

    for index, line in enumerate(lines):
        # A line that starts another question means the caller handed us more
        # than one question; stop so later questions cannot overwrite this one.
        if index > 0 and next_question_pattern.match(line):
            break

        option = option_pattern.match(line)
        if option:
            question_data[f'选项 {option.group(1)}'] = line[2:].strip()
            continue

        stem = labelled_value(line, '题目')
        if stem is not None:
            # Strip only the label itself; the word 题目 may legitimately
            # appear inside the stem text.
            if not question_data['题干']:
                question_data['题干'] = stem
            continue

        answer = labelled_value(line, '正确答案')
        if answer is not None:
            question_data['答案'] = answer
            continue

        explanation = labelled_value(line, '解析')
        if explanation is not None:
            question_data['解析'] = explanation
            continue

        for label in tag_labels:
            if label in line:
                # At most three space-separated tags: type, file title, tag.
                tags = [type_tag, file_title]
                extra_tag = remove_label(line, label)
                if extra_tag:
                    tags.append(extra_tag)
                question_data['标签'] = ' '.join(tags)
                break

    # No labelled stem: fall back to the first line unless it is clearly an
    # option, answer, explanation or tag line.
    if not question_data['题干'] and lines:
        first_line = lines[0]
        reserved = ('正确答案', '解析') + tag_labels
        if not option_pattern.match(first_line) and not first_line.startswith(reserved):
            question_data['题干'] = first_line

    return question_data
def parse_question_file(file_path):
    """Read a question text file and group its questions by difficulty.

    Questions start with a marker at the beginning of a line: ``A<n>.`` for
    基础题, ``B<n>.`` for 进阶题, ``C<n>.`` for 竞赛题 (leading whitespace or
    Markdown decoration such as ``#``, ``*``, ``>``, ``-`` is tolerated).
    A question ends where the next marker of *any* letter begins, so the last
    question of one section never absorbs the questions of the next section.

    Scores come from the ``每题<N>分`` declarations in file order (first for A,
    second for B, third for C). When a declaration is missing the defaults
    '5', '10' and '15' are used.

    Args:
        file_path: Path of the ``.txt`` file to parse.

    Returns:
        ``(questions, file_title)`` where ``questions`` maps 基础题 / 进阶题 /
        竞赛题 to lists of row dicts produced by ``parse_question_block`` and
        ``file_title`` is the file name without ``.txt`` and with underscores
        replaced by spaces.
    """
    # Derive the title from the file name.
    filename = os.path.basename(file_path)
    file_title = filename.replace('.txt', '').replace('_', ' ')

    questions = {
        '基础题': [],
        '进阶题': [],
        '竞赛题': []
    }

    # UTF-8 goes first: its decoder rejects almost all GBK text, whereas the
    # GBK decoder accepts many UTF-8 byte sequences and silently produces
    # mojibake. latin-1 is the last resort because it never fails.
    content = None
    for encoding in ('utf-8-sig', 'gbk', 'gb18030', 'latin-1'):
        try:
            with open(file_path, 'r', encoding=encoding) as f:
                content = f.read()
            break
        except UnicodeDecodeError:
            continue

    if content is None:
        print(f"无法读取文件: {file_path}")
        return questions, file_title

    # Per-section scores, in the order they are declared in the file.
    scores = re.findall(r'每题(\d+)分', content)

    # letter -> (result key, index into scores, default score)
    sections = {
        'A': ('基础题', 0, '5'),
        'B': ('进阶题', 1, '10'),
        'C': ('竞赛题', 2, '15'),
    }

    # Locate every question marker once and slice the text between
    # consecutive markers, whatever their letter.
    markers = list(re.finditer(r'^[ \t#*>\-]*([ABC])\d+\.', content, re.MULTILINE))
    seen_per_type = {letter: 0 for letter in sections}

    for position, marker in enumerate(markers):
        if position + 1 < len(markers):
            block_end = markers[position + 1].start()
        else:
            block_end = len(content)
        block = content[marker.end():block_end]

        letter = marker.group(1)
        # Number questions per section, counting skipped blocks as well.
        seen_per_type[letter] += 1

        lines = [line.strip() for line in block.split('\n') if line.strip()]
        if len(lines) < 5:  # too short to hold a stem, options and an answer
            continue

        category, score_index, default_score = sections[letter]
        score = scores[score_index] if len(scores) > score_index else default_score

        question_data = parse_question_block(
            lines, letter, seen_per_type[letter], score, file_title
        )
        if question_data['题干']:
            questions[category].append(question_data)

    return questions, file_title
def create_merged_excel(all_questions, template_path, output_file):
    """Write all questions into a copy of the Excel template.

    The template is copied to ``output_file``, the data rows of its 单选题
    sheet (row 2 onwards) are emptied, and one row per question is written
    in the fixed column order below. Row 1 is left untouched.

    Args:
        all_questions: List of question dicts keyed by column title. Each
            dict's 序号 is overwritten in place with its 1-based position.
        template_path: Path of the ``.xlsx`` template to copy.
        output_file: Destination path of the merged workbook.
    """
    # Work on a copy so the template itself is never modified.
    shutil.copy2(template_path, output_file)

    wb = load_workbook(output_file)
    ws = wb['单选题']

    # Column order of the sheet; the last (15th) column is left empty.
    single_choice_fields = [
        '序号', '题干', '选项 A', '选项 B', '选项 C', '选项 D',
        '选项 E', '选项 F', '选项 G', '选项 H', '解析', '分数', '答案', '标签', ''
    ]
    column_count = len(single_choice_fields)

    # Empty the existing data rows while keeping their formatting.
    # ws.cell(..., value=None) does not assign anything in openpyxl, so the
    # value has to be reset on the cell object itself; otherwise template
    # rows beyond the new data would survive in the output.
    for row in range(2, ws.max_row + 1):
        for col in range(1, column_count + 1):
            ws.cell(row=row, column=col).value = None

    print(f"写入 {len(all_questions)} 道题目到合并文件...")
    for row_idx, question in enumerate(all_questions, 2):
        # Renumber consecutively across all source files.
        question['序号'] = row_idx - 1
        for col, field_name in enumerate(single_choice_fields, 1):
            value = question.get(field_name, '')
            # Write an empty string rather than None.
            if value is None:
                value = ''
            ws.cell(row=row_idx, column=col, value=value)

    wb.save(output_file)
    print(f"合并文件已保存: {output_file}")
def main(questions_dir='/Users/moshui/Documents/survey/questions',
         output_dir='/Users/moshui/Documents/survey/output',
         template_path='/Users/moshui/Documents/survey/1.xlsx'):
    """Merge every ``.txt`` question file into one Excel workbook.

    Each ``.txt`` file in ``questions_dir`` is parsed in sorted name order,
    its questions are collected (基础题, then 进阶题, then 竞赛题), and the
    result is written to ``<output_dir>/合并题库.xlsx`` using ``template_path``
    as the workbook template. A summary and the ten most common tag strings
    are printed afterwards.

    Args:
        questions_dir: Directory holding the ``.txt`` question files.
        output_dir: Directory for the merged workbook; created if missing.
        template_path: Importable ``.xlsx`` file used as the template.

    Returns:
        None. If the template or the questions directory is missing, a
        message is printed and nothing is created.
    """
    # Local import keeps this function self-contained.
    from collections import Counter

    # Validate inputs before touching the file system.
    if not os.path.exists(template_path):
        print(f"模板文件不存在: {template_path}")
        return
    if not os.path.isdir(questions_dir):
        print(f"题库目录不存在: {questions_dir}")
        return

    os.makedirs(output_dir, exist_ok=True)

    print("开始合并所有题库文件...")
    print("=" * 50)

    all_questions = []
    file_info = []

    for txt_file in sorted(os.listdir(questions_dir)):
        if not txt_file.endswith('.txt'):
            continue

        txt_path = os.path.join(questions_dir, txt_file)
        print(f"正在处理: {txt_file}")

        questions, file_title = parse_question_file(txt_path)
        total_questions = sum(len(q) for q in questions.values())
        print(f" 文件标题: {file_title}")
        print(f" 找到 {total_questions} 道题目")
        for qtype, qlist in questions.items():
            if qlist:
                print(f" {qtype}: {len(qlist)}")

        if total_questions > 0:
            # Keep a fixed section order inside each file.
            for question_type in ['基础题', '进阶题', '竞赛题']:
                all_questions.extend(questions[question_type])
            file_info.append({
                'filename': txt_file,
                'title': file_title,
                'total': total_questions
            })
        print()

    if not all_questions:
        print("没有找到任何有效题目!")
        return

    output_file = os.path.join(output_dir, '合并题库.xlsx')
    create_merged_excel(all_questions, template_path, output_file)

    print("=" * 50)
    print("合并完成!")
    print(f"总题目数: {len(all_questions)}")
    print(f"涉及文件数: {len(file_info)}")
    print(f"输出文件: {output_file}")
    print()
    print("文件清单:")
    for info in file_info:
        print(f" - {info['filename']}: {info['title']} ({info['total']}题)")

    # Tag statistics: count identical tag strings, ignoring empty ones.
    print()
    print("标签统计:")
    all_tags = [q.get('标签', '') for q in all_questions]
    tag_count = Counter(tag for tag in all_tags if tag)

    print("最常见的标签:")
    for tag, count in tag_count.most_common(10):
        print(f" {tag}: {count}")
# Run the merge only when executed as a script, not when imported.
if __name__ == "__main__":
    main()