survey/generate/convert_questions_with_excel_fix.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os
import re
import shutil
from openpyxl import load_workbook
import subprocess
import sys

def parse_question_block(lines, question_type, question_num, score):
    """Parse the text lines of one question into a spreadsheet row dict.

    Recognised lines (a full-width or ASCII colon is accepted after labels):

    * ``题目：...``      -- question stem; only the leading label is removed,
      so the word may still appear inside the stem text itself.
    * ``A.`` .. ``H.``  -- option text for the matching option column.
    * ``正确答案：...``  -- the answer.
    * lines containing ``知识点标签`` or ``能力标签`` -- extra tags, appended
      (space separated) after the tag derived from ``question_type``.

    If no labelled stem is found, the first line is used as the stem unless
    it is itself an option, answer or tag line.

    Args:
        lines: Stripped, non-empty lines belonging to a single question.
        question_type: 'A', 'B' or 'C'; mapped to the base tag
            基础题 / 进阶题 / 竞赛题. Any other value yields no base tag.
        question_num: Value stored under '序号'.
        score: Value stored under '分数'.

    Returns:
        dict keyed by the column names used by the Excel template. The
        '解析' field and the trailing '' field are always left empty here.
    """
    question_data = {
        '序号': question_num,  # plain numeric sequence number
        '题干': '',
        '选项 A': '',
        '选项 B': '',
        '选项 C': '',
        '选项  D': '',
        '选项 E': '',
        '选项 F': '',
        '选项 G': '',
        '选项 H': '',
        '解析': '',
        '分数': score,
        '答案': '',
        '标签': '',  # filled with the question-type tag below
        '': ''  # 15th column stays empty
    }

    # Base tag derived from the question type.
    type_tags = {'A': '基础题', 'B': '进阶题', 'C': '竞赛题'}
    question_data['标签'] = type_tags.get(question_type, '')

    # Option letter -> column key. The D key really contains two spaces;
    # it has to match the field name used when the row is written out.
    option_keys = {letter: f'选项 {letter}' for letter in 'ABCEFGH'}
    option_keys['D'] = '选项  D'

    tag_labels = ('知识点标签', '能力标签')

    def is_option_line(text):
        # True for lines such as "A. text" .. "H. text".
        return text[:1] in option_keys and text[1:2] == '.'

    def strip_label(text, label):
        # Remove the first occurrence of the label plus an optional colon.
        pattern = re.escape(label) + r'\s*[：:]?\s*'
        return re.sub(pattern, '', text, count=1).strip()

    for line in lines:
        # Stem: only a line that *starts* with the label counts, and the
        # first non-empty one wins, so a later line that merely mentions
        # the word cannot overwrite it.
        if line.startswith('题目'):
            if not question_data['题干']:
                question_data['题干'] = strip_label(line, '题目')
            continue

        if is_option_line(line):
            question_data[option_keys[line[0]]] = line[2:].strip()
            continue

        answer_match = re.match(r'正确答案\s*[：:]\s*(.*)', line)
        if answer_match:
            question_data['答案'] = answer_match.group(1).strip()
            continue

        # Knowledge-point / ability tags are appended to the existing tag
        # string, separated by a single space.
        for label in tag_labels:
            if label in line:
                extra_tag = strip_label(line, label)
                if extra_tag and question_data['标签']:
                    question_data['标签'] = f"{question_data['标签']} {extra_tag}"
                elif extra_tag:
                    question_data['标签'] = extra_tag
                break

    # Fallback: an unlabelled first line is taken as the stem.
    if not question_data['题干'] and lines:
        first_line = lines[0]
        reserved = ('正确答案', '能力标签', '知识点标签')
        if not is_option_line(first_line) and not first_line.startswith(reserved):
            question_data['题干'] = first_line

    return question_data

def parse_question_file(file_path):
    """Read a question text file and group its questions by difficulty.

    Questions are introduced by a header such as ``A1.``, ``B12.`` or
    ``C3.`` at the start of a line (leading spaces/tabs allowed). The text
    of a question runs from its header to the next header of *any* type,
    so the last A question no longer swallows the B and C sections.

    Scores are taken from the ``每题N分`` occurrences in the file, in order:
    the first applies to A questions, the second to B, the third to C.
    When an occurrence is missing the defaults '5', '10' and '15' are used.

    Args:
        file_path: Path of the text file to parse.

    Returns:
        dict with the keys '基础题', '进阶题' and '竞赛题', each mapping to a
        list of row dicts produced by ``parse_question_block`` (file order).
        All lists are empty when the file cannot be decoded.
    """
    questions = {
        '基础题': [],
        '进阶题': [],
        '竞赛题': []
    }

    # Try UTF-8 first: GBK text is almost never valid UTF-8, whereas UTF-8
    # bytes frequently decode as GBK without an error and produce mojibake.
    encodings = ['utf-8-sig', 'gbk', 'gb18030', 'latin-1']
    content = None

    for encoding in encodings:
        try:
            with open(file_path, 'r', encoding=encoding) as f:
                content = f.read()
            break
        except UnicodeDecodeError:
            continue

    if content is None:
        print(f"无法读取文件: {file_path}")
        return questions

    # Score hints, in order of appearance.
    scores = re.findall(r'每题(\d+)分', content)

    # type letter -> (category key, index into scores, default score)
    type_info = {
        'A': ('基础题', 0, '5'),
        'B': ('进阶题', 1, '10'),
        'C': ('竞赛题', 2, '15'),
    }

    # Headers are matched at line starts only, so option lines ("A.") and
    # text such as "B12." inside a sentence are not mistaken for headers.
    header_pattern = re.compile(r'^[ \t]*([ABC])\d+\.', re.MULTILINE)
    headers = list(header_pattern.finditer(content))

    # Per-type running number, counting every header (including skipped ones).
    counters = dict.fromkeys(type_info, 0)

    for position, header in enumerate(headers):
        question_type = header.group(1)
        counters[question_type] += 1

        # The block ends where the next header (of any type) begins.
        if position + 1 < len(headers):
            block_end = headers[position + 1].start()
        else:
            block_end = len(content)
        block = content[header.end():block_end]

        lines = [line.strip() for line in block.split('\n') if line.strip()]
        if len(lines) < 5:  # too short to hold a stem, options and an answer
            continue

        category, score_index, default_score = type_info[question_type]
        score = scores[score_index] if len(scores) > score_index else default_score

        question_data = parse_question_block(
            lines, question_type, counters[question_type], score
        )
        if question_data['题干']:
            questions[category].append(question_data)

    return questions

def excel_resave(file_path):
    """Ask a locally installed Microsoft Excel to open and re-save a workbook.

    Only macOS is automated (through ``osascript``). On Windows a notice is
    printed; on every other platform nothing happens. This is a best-effort
    step: any failure is reported on stdout and turned into ``False``.

    Note that the AppleScript ends with ``quit saving yes``, which quits
    Excel after saving.

    Args:
        file_path: Path of the workbook to re-save.

    Returns:
        True when osascript exits with status 0, otherwise False.
    """
    try:
        if sys.platform == 'darwin':  # macOS
            # ``POSIX file`` needs an absolute path, and the path is embedded
            # in an AppleScript string literal, so backslashes and double
            # quotes have to be escaped or the script no longer parses.
            absolute_path = os.path.abspath(file_path)
            escaped_path = absolute_path.replace('\\', '\\\\').replace('"', '\\"')

            script = f'''
            tell application "Microsoft Excel"
                open POSIX file "{escaped_path}"
                activate
                delay 2
                save active workbook
                quit saving yes
            end tell
            '''

            result = subprocess.run(['osascript', '-e', script],
                                    capture_output=True, text=True, timeout=30)

            if result.returncode == 0:
                print(f"  Excel重新保存成功: {os.path.basename(file_path)}")
                return True

            print(f"  Excel重新保存失败: {result.stderr}")
            return False

        if sys.platform == 'win32':  # Windows
            # Not automated; pywin32 or a similar tool would be required.
            print("  Windows系统暂不支持自动Excel重新保存")
            return False

    except Exception as e:
        # Deliberately broad: a failed re-save (timeout, osascript missing,
        # ...) must not abort the conversion of the remaining files.
        print(f"  Excel重新保存出错: {e}")
        return False

    return False

def create_excel_with_format_fix(questions, template_path, output_file):
    """Copy the template workbook and fill its '单选题' sheet with questions.

    The template is copied to ``output_file``; when there is at least one
    question, the values in columns 1-15 from row 2 downwards are cleared
    (cell formatting is left alone) and one row per question is written.
    The workbook is saved and then handed to ``excel_resave``.

    Args:
        questions: dict mapping '基础题' / '进阶题' / '竞赛题' to lists of row
            dicts. Missing categories are treated as empty. The dicts are
            modified in place: '序号' is renumbered 1..N across categories.
        template_path: Workbook used as the template; it must contain a
            sheet named '单选题'.
        output_file: Destination path of the generated workbook.
    """
    # Work on a copy so the template itself is never modified.
    shutil.copy2(template_path, output_file)

    wb = load_workbook(output_file)
    ws = wb['单选题']

    # Column order of the sheet; the D key really contains two spaces.
    single_choice_fields = [
        '序号', '题干', '选项 A', '选项 B', '选项 C', '选项  D',
        '选项 E', '选项 F', '选项 G', '选项 H', '解析', '分数', '答案', '标签', ''
    ]
    column_count = len(single_choice_fields)

    # Concatenate the categories in a fixed order.
    all_questions = []
    for question_type in ['基础题', '进阶题', '竞赛题']:
        all_questions.extend(questions.get(question_type, []))

    if all_questions:
        # Clear existing values from row 2 onwards. ``ws.cell(..., value=None)``
        # does not assign anything, so the value has to be reset on the cell
        # itself. Cells that are already empty are skipped, which also leaves
        # the read-only placeholder cells of merged ranges untouched.
        for row in range(2, ws.max_row + 1):
            for col in range(1, column_count + 1):
                cell = ws.cell(row=row, column=col)
                if cell.value is not None:
                    cell.value = None

        # Write the questions, renumbering them consecutively from 1.
        for row_idx, question in enumerate(all_questions, 2):
            question['序号'] = row_idx - 1

            for col, field_name in enumerate(single_choice_fields, 1):
                value = question.get(field_name, '')
                # Store an empty string rather than None.
                if value is None:
                    value = ''

                ws.cell(row=row_idx, column=col, value=value)

    wb.save(output_file)

    # Best effort: let Excel re-save the file to normalise its format.
    print("  尝试用Excel重新保存...")
    excel_resave(output_file)

def main(questions_dir='/Users/moshui/Documents/survey/questions',
         output_dir='/Users/moshui/Documents/survey/output',
         template_path='/Users/moshui/Documents/survey/1.xlsx'):
    """Convert every ``.txt`` question file in a directory to a workbook.

    Each ``name.txt`` in ``questions_dir`` is parsed and, when it contains
    at least one question, written to ``output_dir/name.xlsx`` using
    ``template_path`` as the template. Files are processed in sorted order.
    Nothing is done when the template or the questions directory is missing.

    Args:
        questions_dir: Directory holding the source ``.txt`` files.
        output_dir: Directory for the generated workbooks (created if needed).
        template_path: Workbook known to import correctly, used as template.
    """
    # Bail out early when the inputs are missing.
    if not os.path.exists(template_path):
        print(f"模板文件不存在: {template_path}")
        return

    if not os.path.isdir(questions_dir):
        print(f"题目目录不存在: {questions_dir}")
        return

    os.makedirs(output_dir, exist_ok=True)

    print("开始转换文件...")
    print("注意：此脚本会尝试自动用Excel重新保存文件以修复格式问题")
    print("如果自动重新保存失败，请手动用Excel打开并保存文件")
    print()

    # Sorted so that the processing order is reproducible.
    for txt_file in sorted(os.listdir(questions_dir)):
        if not txt_file.endswith('.txt'):
            continue

        txt_path = os.path.join(questions_dir, txt_file)
        print(f"正在处理: {txt_file}")

        questions = parse_question_file(txt_path)

        total_questions = sum(len(q) for q in questions.values())
        print(f"  找到 {total_questions} 道题目")
        for qtype, qlist in questions.items():
            if qlist:
                print(f"    {qtype}: {len(qlist)} 道")

        if total_questions > 0:
            # Swap only the final extension; str.replace would rewrite every
            # '.txt' occurring anywhere in the file name.
            base_name = os.path.splitext(txt_file)[0] + '.xlsx'
            output_file = os.path.join(output_dir, base_name)

            create_excel_with_format_fix(questions, template_path, output_file)
            print(f"  已保存到: {output_file}")
        else:
            print("  未找到有效题目")
        print()

# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()