'''
Download all comments from a specified V2EX topic.
(从指定某个 v2ex 帖子下载所有的评论)
Author: ytkz
'''
import requests
from bs4 import BeautifulSoup
import re
import time
from urllib.parse import urljoin
import re
def extract_users_from_text(text):
    """
    Extract username / relative-time pairs from raw comment text.

    Parameters:
        text (str): Text containing user comment headers, e.g.
            "alice 3 小时 15 分钟前 ...".

    Returns:
        list[dict]: One dict per match, each with keys 'username' and 'time'.
    """
    # V2EX renders relative times as one or more "<n> 天/小时/分钟" units
    # ending in 前 ("3 小时 15 分钟前", "5 分钟前", "2 天前").  The previous
    # pattern accepted only the "<n> 小时 <n> 分钟前" shape, so it missed
    # very recent ("X 分钟前") and day-old ("X 天前") replies; this pattern
    # matches a strict superset of the old one.
    pattern = r'(\S{2,})\s+((?:\d+\s+(?:天|小时|分钟)\s+)*\d+\s+(?:天|小时|分钟)前)'
    return [
        {'username': username, 'time': time_info}
        for username, time_info in re.findall(pattern, text)
    ]
def get_page_content(url, headers=None):
    """
    Fetch a single page and return its HTML text.

    Parameters:
        url (str): Page URL to download.
        headers (dict | None): Optional HTTP headers; a desktop-Chrome
            User-Agent is substituted when None is given.

    Returns:
        str | None: Response body on success, None on any request failure.
    """
    default_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    }
    try:
        response = requests.get(
            url,
            headers=default_headers if headers is None else headers,
            timeout=10,
        )
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"请求失败: {url} → {e}")
        return None
    return response.text
def parse_comments(html):
    """
    Parse one page of a topic.

    Parameters:
        html (str | None): HTML of a topic page, or None/empty on fetch failure.

    Returns:
        tuple[list[str], str | None]: Formatted comment blocks and the
        absolute URL of the next page (None when there is no next page).
    """
    if not html:
        return [], None

    soup = BeautifulSoup(html, 'html.parser')
    comments = []

    # Reply cells carry ids of the form r_<id> inside the #Main column.
    for cell in soup.select('#Main .cell[id^="r_"]'):
        matched = extract_users_from_text(cell.text)
        author = matched[0]["username"] if matched else "未知用户"

        ago = cell.select_one('.ago')
        posted = ago.get_text(strip=True) if ago else ""

        body = cell.select_one('.reply_content')
        text = body.get_text(separator="\n", strip=True) if body else ""
        if text:
            comments.append(f"@{author} {posted}\n{text}\n{'-' * 60}\n")

    # The anchor immediately following the current-page marker is the
    # next-page link; only same-topic (/t/...) links are followed.
    next_page = None
    link = soup.select_one('#Main .page_current + a.page_normal')
    if link and 'href' in link.attrs and link['href'].startswith('/t/'):
        next_page = urljoin("https://www.v2ex.com", link['href'])

    return comments, next_page
def download_all_replies(topic_url, output_file="v2ex_replies.txt"):
    """
    Crawl every page of a V2EX topic and save all replies to a text file.

    Parameters:
        topic_url (str): URL of the topic's first page.
        output_file (str): Destination path for the collected replies.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    }

    print(f"开始抓取帖子:{topic_url}")

    all_comments = []
    current_url, page_num = topic_url, 1

    while current_url:
        print(f" 正在处理第 {page_num} 页...")
        html = get_page_content(current_url, headers)
        if not html:
            break

        page_comments, next_url = parse_comments(html)
        all_comments.extend(page_comments)
        if not next_url:
            break

        current_url = next_url
        page_num += 1
        time.sleep(2.5)  # polite delay between page fetches

    if not all_comments:
        print("未抓取到任何评论,可能被反爬或页面结构变化。")
        return

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(f"帖子地址:{topic_url}\n")
        f.write(f"总共抓取到 {len(all_comments)} 条回复\n")
        f.write("=" * 70 + "\n\n")
        f.writelines(all_comments)

    print(f"\n抓取完成!共 {len(all_comments)} 条评论,已保存至:{output_file}")
if __name__ == '__main__':
    # Dump every reply of topic 1196704 into a dedicated text file.
    topic = 'https://www.v2ex.com/t/1196704'
    download_all_replies(topic, output_file="v2ex_comments_1196704.txt")