#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Merge plain-text question banks into a single Excel workbook.

Each ``.txt`` file holds multiple-choice questions introduced by markers such
as ``A1.``, ``B2.`` or ``C3.`` (A = basic, B = advanced, C = competition).
Every question is parsed into a row dictionary and all rows are written into
the ``单选题`` sheet of a copy of a template workbook.
"""

import os
import re
import shutil
from collections import Counter

try:
    from openpyxl import load_workbook
except ImportError:
    # The text parsers below do not need openpyxl; only the Excel writer does.
    load_workbook = None

DEFAULT_QUESTIONS_DIR = '/Users/moshui/Documents/survey/questions'
DEFAULT_OUTPUT_DIR = '/Users/moshui/Documents/survey/output'
DEFAULT_TEMPLATE_PATH = '/Users/moshui/Documents/survey/1.xlsx'

SHEET_NAME = '单选题'

# Column order of the target sheet; the 15th column is intentionally blank.
SINGLE_CHOICE_FIELDS = (
    '序号', '题干', '选项 A', '选项 B', '选项 C', '选项 D', '选项 E',
    '选项 F', '选项 G', '选项 H', '解析', '分数', '答案', '标签', '',
)

# (marker letter, category name) in the order the scores appear in a file.
_SECTIONS = (('A', '基础题'), ('B', '进阶题'), ('C', '竞赛题'))
_CATEGORY_BY_LETTER = dict(_SECTIONS)
_DEFAULT_BASIC_SCORE = '5'
_MIN_BLOCK_LINES = 5

# Strict UTF-8 must come before GBK: GBK can decode UTF-8 bytes without
# raising, which silently produces mojibake.  latin-1 never fails and is the
# last resort.
_ENCODINGS = ('utf-8-sig', 'gbk', 'gb2312', 'latin-1')

_OPTION_PREFIXES = ('A.', 'B.', 'C.', 'D.')
_NON_STEM_PREFIXES = ('正确答案', '能力标签', '知识点标签') + _OPTION_PREFIXES
_TAG_LABELS = ('知识点标签', '能力标签')

# Question markers are matched anywhere in the text (not only at line start).
_MARKER_RE = re.compile(r'\n*([ABC])\d+\.')
_SCORE_RE = re.compile(r'每题(\d+)分')
_STEM_LABEL_RE = re.compile(r'题目[::]?')
_ANSWER_RE = re.compile(r'正确答案[::]')


def _extract_tag(line):
    """Return the tag text of a tag line, or None if *line* is not one."""
    for label in _TAG_LABELS:
        if label in line:
            return re.sub(label + '[::]', '', line).strip()
    return None


def parse_question_block(lines, question_type, question_num, score, file_title):
    """Parse the lines of one question into a row dictionary.

    Args:
        lines: Stripped, non-empty lines of the question (marker removed).
        question_type: ``'A'``, ``'B'`` or ``'C'``; any other value yields an
            empty type tag.
        question_num: Sequence number stored under ``'序号'``.
        score: Score stored under ``'分数'``.
        file_title: Title of the source file, used as the second tag.

    Returns:
        A dict keyed by the sheet's column names.  ``'标签'`` is the type tag
        alone, or ``"<type tag> <file title> <tag>"`` when the block has a
        ``知识点标签``/``能力标签`` line (the last such line wins).
    """
    type_tag = _CATEGORY_BY_LETTER.get(question_type, '')
    question_data = {
        '序号': question_num,
        '题干': '',
        '选项 A': '',
        '选项 B': '',
        '选项 C': '',
        '选项 D': '',
        '选项 E': '',
        '选项 F': '',
        '选项 G': '',
        '选项 H': '',
        '解析': '',
        '分数': score,
        '答案': '',
        '标签': type_tag,
        '': '',  # 15th column stays empty
    }

    for line in lines:
        if '题目' in line and not line.startswith(_NON_STEM_PREFIXES):
            # Drop only the first "题目" label; the same word may legitimately
            # occur again inside the question text.
            question_data['题干'] = _STEM_LABEL_RE.sub('', line, count=1).strip()
        elif line.startswith(_OPTION_PREFIXES):
            question_data['选项 ' + line[0]] = line[2:].strip()
        elif _ANSWER_RE.match(line):
            question_data['答案'] = _ANSWER_RE.sub('', line, count=1).strip()
        else:
            extra_tag = _extract_tag(line)
            if extra_tag is not None:
                tags = [type_tag, file_title]
                if extra_tag:
                    tags.append(extra_tag)
                question_data['标签'] = ' '.join(tags)

    # No labelled stem line: fall back to the first line unless it is an
    # option, answer or tag line.
    if not question_data['题干'] and lines:
        first_line = lines[0]
        if not first_line.startswith(_NON_STEM_PREFIXES):
            question_data['题干'] = first_line

    return question_data


def _read_text(file_path):
    """Return the file's text using the first encoding that decodes it."""
    for encoding in _ENCODINGS:
        try:
            with open(file_path, 'r', encoding=encoding) as handle:
                return handle.read()
        except UnicodeDecodeError:
            continue
    return None


def parse_question_file(file_path):
    """Parse one question-bank text file.

    Args:
        file_path: Path of the ``.txt`` file.

    Returns:
        ``(questions, file_title)`` where ``questions`` maps ``'基础题'``,
        ``'进阶题'`` and ``'竞赛题'`` to lists of row dictionaries, and
        ``file_title`` is the file name without its ``.txt`` suffix and with
        underscores replaced by spaces.

    Raises:
        OSError: If the file cannot be opened.
    """
    filename = os.path.basename(file_path)
    if filename.endswith('.txt'):
        filename = filename[:-len('.txt')]
    file_title = filename.replace('_', ' ')

    questions = {category: [] for _, category in _SECTIONS}

    content = _read_text(file_path)
    if content is None:
        print(f"无法读取文件: {file_path}")
        return questions, file_title

    # The n-th "每题N分" statement gives the score of the n-th section.  The
    # basic section falls back to a default; the advanced and competition
    # sections are only parsed when the file declares their score.
    scores = _SCORE_RE.findall(content)
    section_scores = {}
    for index, (letter, _category) in enumerate(_SECTIONS):
        if index < len(scores):
            section_scores[letter] = scores[index]
        elif index == 0:
            section_scores[letter] = _DEFAULT_BASIC_SCORE

    # A block runs from its marker to the next marker of ANY section, so the
    # last question of one section cannot absorb the following sections.
    markers = list(_MARKER_RE.finditer(content))
    seen = Counter()
    for position, marker in enumerate(markers):
        letter = marker.group(1)
        seen[letter] += 1
        if letter not in section_scores:
            continue

        if position + 1 < len(markers):
            block_end = markers[position + 1].start()
        else:
            block_end = len(content)
        block = content[marker.end():block_end]

        lines = [line.strip() for line in block.strip().split('\n') if line.strip()]
        if len(lines) < _MIN_BLOCK_LINES:
            continue  # too short to be a complete question

        question_data = parse_question_block(
            lines, letter, seen[letter], section_scores[letter], file_title)
        if question_data['题干']:
            questions[_CATEGORY_BY_LETTER[letter]].append(question_data)

    return questions, file_title


def create_merged_excel(all_questions, template_path, output_file):
    """Write all questions into a copy of the template workbook.

    Args:
        all_questions: Row dictionaries; each one's ``'序号'`` is overwritten
            in place with a consecutive number starting at 1.
        template_path: Workbook copied to *output_file* before writing.
        output_file: Destination path of the merged workbook.

    Raises:
        ImportError: If openpyxl is not installed.
        KeyError: If the workbook has no ``单选题`` sheet.
    """
    if load_workbook is None:
        raise ImportError('openpyxl is required to write the merged workbook')

    shutil.copy2(template_path, output_file)
    wb = load_workbook(output_file)
    ws = wb[SHEET_NAME]

    # Clear existing data rows while keeping cell formatting.  The value must
    # be assigned explicitly: ws.cell(..., value=None) leaves the old value.
    column_count = len(SINGLE_CHOICE_FIELDS)
    for row in range(2, ws.max_row + 1):
        for col in range(1, column_count + 1):
            ws.cell(row=row, column=col).value = None

    print(f"写入 {len(all_questions)} 道题目到合并文件...")
    for row_idx, question in enumerate(all_questions, 2):
        question['序号'] = row_idx - 1
        for col, field_name in enumerate(SINGLE_CHOICE_FIELDS, 1):
            value = question.get(field_name, '')
            # Write an empty string rather than None.
            ws.cell(row=row_idx, column=col, value='' if value is None else value)

    wb.save(output_file)
    print(f"合并文件已保存: {output_file}")


def main(questions_dir=DEFAULT_QUESTIONS_DIR, output_dir=DEFAULT_OUTPUT_DIR,
         template_path=DEFAULT_TEMPLATE_PATH):
    """Merge every ``.txt`` question file of a directory into one workbook.

    Args:
        questions_dir: Directory scanned (non-recursively) for ``.txt`` files.
        output_dir: Directory receiving ``合并题库.xlsx``; created if missing.
        template_path: Template workbook that is copied and filled.
    """
    if load_workbook is None:
        print("未安装 openpyxl,无法生成Excel文件")
        return

    if not os.path.exists(template_path):
        print(f"模板文件不存在: {template_path}")
        return

    if not os.path.isdir(questions_dir):
        print(f"题库目录不存在: {questions_dir}")
        return

    os.makedirs(output_dir, exist_ok=True)

    print("开始合并所有题库文件...")
    print("=" * 50)

    all_questions = []
    file_info = []

    for txt_file in sorted(os.listdir(questions_dir)):
        if not txt_file.endswith('.txt'):
            continue

        txt_path = os.path.join(questions_dir, txt_file)
        print(f"正在处理: {txt_file}")

        questions, file_title = parse_question_file(txt_path)
        total_questions = sum(len(q) for q in questions.values())

        print(f" 文件标题: {file_title}")
        print(f" 找到 {total_questions} 道题目")
        for qtype, qlist in questions.items():
            if qlist:
                print(f" {qtype}: {len(qlist)} 道")

        if total_questions > 0:
            for _letter, category in _SECTIONS:
                all_questions.extend(questions[category])
            file_info.append({
                'filename': txt_file,
                'title': file_title,
                'total': total_questions,
            })
        print()

    if not all_questions:
        print("没有找到任何有效题目!")
        return

    output_file = os.path.join(output_dir, '合并题库.xlsx')
    create_merged_excel(all_questions, template_path, output_file)

    print("=" * 50)
    print("合并完成!")
    print(f"总题目数: {len(all_questions)}")
    print(f"涉及文件数: {len(file_info)}")
    print(f"输出文件: {output_file}")
    print()
    print("文件清单:")
    for info in file_info:
        print(f" - {info['filename']}: {info['title']} ({info['total']}题)")

    print()
    print("标签统计:")
    tag_count = Counter(q['标签'] for q in all_questions if q.get('标签', ''))
    print("最常见的标签:")
    for tag, count in tag_count.most_common(10):
        print(f" {tag}: {count}题")


if __name__ == "__main__":
    main()