'''
Download all comments from a specified V2EX topic.
(从指定某个 v2ex 帖子下载所有的评论)
Author: ytkz
'''
import requests
from bs4 import BeautifulSoup
import re
import time
from urllib.parse import urljoin
import re
def extract_users_from_text(text):
    """
    Extract username / relative-time pairs from raw comment text.

    Parameters:
        text (str): Text containing user comment headers, e.g.
            "alice 3 小时 15 分钟前 ...".

    Returns:
        list[dict]: One dict per match, each with keys 'username' and 'time'.
    """
    # V2EX renders relative times as one or more "<n> 天/小时/分钟" units
    # ending in 前 ("3 小时 15 分钟前", "5 分钟前", "2 天前").  The previous
    # pattern accepted only the "<n> 小时 <n> 分钟前" shape, so it missed
    # very recent ("X 分钟前") and day-old ("X 天前") replies; this pattern
    # matches a strict superset of the old one.
    pattern = r'(\S{2,})\s+((?:\d+\s+(?:天|小时|分钟)\s+)*\d+\s+(?:天|小时|分钟)前)'
    return [
        {'username': username, 'time': time_info}
        for username, time_info in re.findall(pattern, text)
    ]
def get_page_content(url, headers=None):
    """
    Fetch a single page and return its HTML text.

    Parameters:
        url (str): Page URL to download.
        headers (dict | None): Optional HTTP headers; a desktop-Chrome
            User-Agent is substituted when None is given.

    Returns:
        str | None: Response body on success, None on any request failure.
    """
    default_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    }
    try:
        response = requests.get(
            url,
            headers=default_headers if headers is None else headers,
            timeout=10,
        )
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"请求失败: {url} → {e}")
        return None
    return response.text
def parse_comments(html):
    """
    Parse one page of a topic.

    Parameters:
        html (str | None): HTML of a topic page, or None/empty on fetch failure.

    Returns:
        tuple[list[str], str | None]: Formatted comment blocks and the
        absolute URL of the next page (None when there is no next page).
    """
    if not html:
        return [], None

    soup = BeautifulSoup(html, 'html.parser')
    comments = []

    # Reply cells carry ids of the form r_<id> inside the #Main column.
    for cell in soup.select('#Main .cell[id^="r_"]'):
        matched = extract_users_from_text(cell.text)
        author = matched[0]["username"] if matched else "未知用户"

        ago = cell.select_one('.ago')
        posted = ago.get_text(strip=True) if ago else ""

        body = cell.select_one('.reply_content')
        text = body.get_text(separator="\n", strip=True) if body else ""
        if text:
            comments.append(f"@{author} {posted}\n{text}\n{'-' * 60}\n")

    # The anchor immediately following the current-page marker is the
    # next-page link; only same-topic (/t/...) links are followed.
    next_page = None
    link = soup.select_one('#Main .page_current + a.page_normal')
    if link and 'href' in link.attrs and link['href'].startswith('/t/'):
        next_page = urljoin("https://www.v2ex.com", link['href'])

    return comments, next_page
def download_all_replies(topic_url, output_file="v2ex_replies.txt"):
    """
    Crawl every page of a V2EX topic and save all replies to a text file.

    Parameters:
        topic_url (str): URL of the topic's first page.
        output_file (str): Destination path for the collected replies.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    }

    print(f"开始抓取帖子:{topic_url}")

    all_comments = []
    current_url, page_num = topic_url, 1

    while current_url:
        print(f" 正在处理第 {page_num} 页...")
        html = get_page_content(current_url, headers)
        if not html:
            break

        page_comments, next_url = parse_comments(html)
        all_comments.extend(page_comments)
        if not next_url:
            break

        current_url = next_url
        page_num += 1
        time.sleep(2.5)  # polite delay between page fetches

    if not all_comments:
        print("未抓取到任何评论,可能被反爬或页面结构变化。")
        return

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(f"帖子地址:{topic_url}\n")
        f.write(f"总共抓取到 {len(all_comments)} 条回复\n")
        f.write("=" * 70 + "\n\n")
        f.writelines(all_comments)

    print(f"\n抓取完成!共 {len(all_comments)} 条评论,已保存至:{output_file}")
if __name__ == '__main__':
    # Dump every reply of topic 1196704 into a dedicated text file.
    topic = 'https://www.v2ex.com/t/1196704'
    download_all_replies(topic, output_file="v2ex_comments_1196704.txt")