如何使用Python从PDF中提取文本并转换为Markdown文本的实操

ytkz2024-07-262024-09-09

之前实现了对注册测绘师综合真题PDF转换为文本数据。但是这些文本数据，不能直接导入到数据库中的。

而本文针对这个问题，进行展开。最终实现注册测绘师真题文本处理与答案高亮。

数据结构

首先理清楚数据结构。比如以下这3道题：

1.使用N台（N＞3）GPS接收机进行同步观测所获得的GPS边中，独立的GPS边的数量是
（ ）。
A、N B、N-1 C、N(N+1)/2 D、N(N-1)/2
答案：B

2.我国现行的大地原点、水准原点分别位于（ ）。
A、北京、浙江坎门 B、北京、山东青岛 C、山西泾阳、浙江坎门 D、陕西泾阳、山东青岛
答案：D

3.大地水准面精化工作中，A、B级GPS观测应采用（ ）定位模式。
A、静态相对 B、快速静态相对 C、准动态相对 D、绝对
答案：A

需要把以上的文本分为

1.题目

2.选项

3.答案

对于文字的处理，我们通常用到正则式re。re是一个非常强大的库，它又简单又困难。简单是指它容易使用；困难是指如果要实现指定功能，它就变得很困难。

反正通过re，我们可以实现以上需求。

def parse_text(text):
    '''
    用于2011年到2021年测绘真题
    :param text: 文本
    :return: 题目选项答案的列表
    '''
    text = text.replace('、', '.')
    # 正则表达式，用于分割每个问题的段落
    problem_pattern = re.compile(r'(\d+\..*?(?=^(\d+\.|$)))', re.MULTILINE | re.DOTALL)

    # 正则表达式，用于在每个问题的段落中提取题目、选项、答案和解析
    all_pattern =            re.compile(r'^(.*?)\s*(A[.,\、]*.*?E[.,\、]*.*?)(答案：[\w, ]+)(解析：.*)', re.DOTALL)
    no_explanation_pattern = re.compile(r'^(.*?)\s*(A[.,\、]*.*?D[.,\、]*.*?)(答案：[\w, ]+)', re.DOTALL)

    problems = problem_pattern.findall(text)
    parsed_problems = []
    i = 0
    for problem in problems:
        problem = list(problem)[0]
        match_all = all_pattern.match(problem)
        match_no_explanation = no_explanation_pattern.match(problem)

        match = match_all if match_all else match_no_explanation
        i+=1
        if i == 30:
            print()
        if match:
            question = match.group(1).strip()

            #  修改后的正则表达式，确保选项前有空格或\n，并且紧跟一个大写字母和“、”
            origin_option = match.group(2).replace('\n', ' ').strip()
            options_pattern = re.compile(r'(?<=\s|\n)[A-F][.,\、]*\s*[^答案]+')
            options_pattern = re.compile(r'(?<=\s|\n)[A-F][.,\、]\s*\S.*?(?= [A-F]|$)')
            options_pattern = re.compile(r'[A-F][.,\、]*\s*\S.*?(?=\S[A-F]|$)')

            options_matches = options_pattern.findall(origin_option)

            options = [option.strip() for option_match in options_matches for option in option_match.split()]

            answer = match.group(3).replace('答案：', '').strip().replace(' ', '')
            explanation = match.group(4).strip() if match_all else ''

            parsed_problems.append({
                'question': question.replace('\n',''),
                'options': options,
                'answer': answer,
                'explanation': explanation
            })

    return parsed_problems

测试如下：

[{'question': '1.使用N台（N＞3）GPS接收机进行同步观测所获得的GPS边中，独立的GPS边的数量是（ ）。', 'options': ['A.N', 'B.N-1', 'C.N(N+1)/2', 'D.N(N-1)/2'], 'answer': 'B', 'explanation': ''}, 
 {'question': '2.我国现行的大地原点.水准原点分别位于（ ）。', 'options': ['A.北京.浙江坎门', 'B.北京.山东青岛', 'C.山西泾阳.浙江坎门', 'D.陕西泾阳.山东青岛'], 'answer': 'D', 'explanation': ''},
 {'question': '3.大地水准面精化工作中，', 'options': ['A.B级GPS观测应采用（', '）定位模式。', 'A.静态相对', 'B.快速静态相对', 'C.准动态相对', 'D.绝对'], 'answer': 'A', 'explanation': ''}]

给选项附上颜色

没附上颜色前的文本如下：

附上颜色前的文本如下

需求分析

1.把纯文本进行数据分类

2.按照正确答案，给选项附上颜色

3.批量处理2011年到2022年的注册测绘师综合真题

下面附上python代码，对相关知识点感兴趣的话，自行看下面的代码。

全部代码

#!/usr/bin/env python
# -*- coding: utf-8 -*- 
# @File : highlight_correct_answers.py 
'''

'''
import os
from read_pdf import get_file_name
import time
import re
import  random

index = {
    'A': 0,
    'B': 1,
    'C': 2,
    'D': 3,
    'E': 4,
    'F': 8
}


def generate_random_color():
    r = random.randint(0, 255)
    g = random.randint(0, 255)
    b = random.randint(0, 255)
    return f'#{r:02x}{g:02x}{b:02x}'


colors = [generate_random_color() for _ in range(100)]  # 生成100种随机颜色
def extract_options(options_str):
    # 假设options_str的格式为'A.N,B.N-1,C.N(N+1)/2,D.N(N-1)/2,E.其他内容'
    # 如果没有E，则只处理到D
    options_list = options_str.rstrip().split(' ')

    # 初始化一个字典来存储选项
    options_dict = {}

    # 遍历选项列表
    for option in options_list:
        # 分割选项为字母和内容
        letter, content = option.split('.', 1)
        options_dict[letter.strip()] = content.strip()

        # 如果没有E，则只取A, B, C, D
    # 如果有E，则直接返回所有选项
    if 'E' in options_dict:
        # 如果需要，可以在这里对options_dict进行进一步处理或返回
        return options_dict
    else:
        # 提取A, B, C, D
        return {k: v for k, v in options_dict.items() if k in ['A', 'B', 'C', 'D']}

def read_md_file(file_path):
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
            return content
    except FileNotFoundError:
        print(f"文件 {file_path} 未找到。")
        return None
def split_text(text):

    # 正则表达式用于匹配题目、选项、答案和解析
    pattern = r'(?P<question>\d+\..*?\n(?:A|B|C|D).*\n)答案：(?P<answer>[A-D])\n解析：(?P<explanation>.*?(?=\n\d+\.|$))'
    questions = []

    # 使用re.finditer来查找所有匹配项
    for match in re.finditer(pattern, text, re.MULTILINE | re.DOTALL):
        question = match.group('question').strip()
        options = re.findall(r'(A|B|C|D)\..*', question)  # 提取选项（简化处理，可能需要根据实际情况调整）
        answer = match.group('answer')
        explanation = match.group('explanation').strip()

        # 将题目、选项、答案和解析添加到列表中（这里为了简单起见，只添加了题目和答案，但可以扩展为包含选项和解析的字典）
        questions.append({
            'question': question,
            'options': options,
            'answer': answer,
            'explanation': explanation
        })
    return questions

def parse_text(text):
    '''
    用于2011年到2021年测绘真题
    :param text: 文本
    :return: 题目选项答案的列表
    '''
    text = text.replace('、', '.')
    # 正则表达式，用于分割每个问题的段落
    problem_pattern = re.compile(r'(\d+\..*?(?=^(\d+\.|$)))', re.MULTILINE | re.DOTALL)

    # 正则表达式，用于在每个问题的段落中提取题目、选项、答案和解析
    all_pattern =            re.compile(r'^(.*?)\s*(A[.,\、]*.*?E[.,\、]*.*?)(答案：[\w, ]+)(解析：.*)', re.DOTALL)
    no_explanation_pattern = re.compile(r'^(.*?)\s*(A[.,\、]*.*?D[.,\、]*.*?)(答案：[\w, ]+)', re.DOTALL)

    problems = problem_pattern.findall(text)
    parsed_problems = []
    i = 0
    for problem in problems:
        problem = list(problem)[0]
        match_all = all_pattern.match(problem)
        match_no_explanation = no_explanation_pattern.match(problem)

        match = match_all if match_all else match_no_explanation
        i+=1
        if i == 26:
            print()
        if match:
            question = match.group(1).strip()

            #  修改后的正则表达式，确保选项前有空格或\n，并且紧跟一个大写字母和“、”
            origin_option = match.group(2).replace('\n', ' ').strip()
            options_pattern = re.compile(r'[A-F][.,\、]*\s*\S.*?(?=\S[A-F]|$)')

            options_matches = options_pattern.findall(origin_option)

            options = [option.strip() for option_match in options_matches for option in option_match.split()]

            answer = match.group(3).replace('答案：', '').strip().replace(' ', '')
            explanation = match.group(4).strip() if match_all else ''

            parsed_problems.append({
                'question': question.replace('\n',''),
                'options': options,
                'answer': answer,
                'explanation': explanation
            })

    return parsed_problems
def partition_topic(text):
    '''
    用于2011年到2021年测绘真题
    :param text: 文本
    :return: 题目选项答案的列表
    '''
    questions = re.findall( r'(\d+\..*?答案：.*?解析：.*?)(?=^\d+\.|$)', text, re.DOTALL | re.MULTILINE)
    return questions

def main(path, outpath):
    if not os.path.exists(outpath):
        os.makedirs(outpath)
    mdfile = get_file_name(path, '.md')

    i = 0
    for md in mdfile:
        outfile = os.path.join(outpath, os.path.splitext(os.path.basename(md))[0] + '.md')
        if os.path.exists(outfile):
            os.remove(outfile)  # 如果存在，则删除，再生成新的outfile

        md_content = read_md_file(md)  # 读取
        parsed_text = parse_text(md_content)  # 2011-2022
        # parsed_text = parse_text2022(md_content)

        try:
            for text in parsed_text:
                questions = text['question']
                options = text['options']
                answers = text['answer']
                explanation = text['explanation']
                for answer in answers:
                    options[index[
                        answer]] = f'**<span style="color:{random.choice(colors)};">{options[index[answer]]}</span>**'

                with open(outfile, 'a', encoding='utf-8') as file:

                    new_options = []
                    file.write(questions)
                    file.write('<br />')
                    for option in options:
                        file.write(option)
                        file.write(' ')
                    file.write('<br />')
                    file.write('答案:{}'.format(answers))
                    file.write('<br />')
                    file.write('<br />')
                    file.write('<br />')
        except  Exception as e:
            print(e)

        i += 1
        print("\r答案加粗加颜色: [{0:50s}] {1:.1f}% {2}".format('#' * int(i / (len(mdfile)) * 50),
                                                                i / len(mdfile) * 100,
                                                                os.path.basename(outfile)), end="",
              flush=True)
        time.sleep(2)


if __name__ == '__main__':
    path = r'D:\Registered_Surveyor\markdown\test2022'
    outpath = r'D:\Registered_Surveyor\markdown\color1'
    main(path, outpath)

是有瑕疵的，比如上图中的第三题就有提取信息的瑕疵。

为什么会那样呢？因为第三题的题干出现了A、B的字眼，把我们拟定的re规则打乱了。

但是这样的错误是少数的，我不想再debug上面的代码了。。。。

数据结构

给选项附上颜色

需求分析

全部代码

资料分享