import subprocess
import json
from datetime import datetime, timedelta


# Extract a single video's metadata with yt-dlp
def fetch_video_metadata(url):
    try:
        # Dump the metadata as JSON without downloading the video
        result = subprocess.run(
            ["yt-dlp", "--skip-download", "--dump-json", url],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )
        # Parse the JSON output
        video_info = json.loads(result.stdout)
        return video_info
    except Exception as e:
        print(f"Failed to extract metadata: {e}")
        return None
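
# Note: if yt-dlp fails (e.g. an unreachable URL), stdout is typically empty,
# so json.loads raises and the except clause above returns None with a message.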


# Keep only videos uploaded within the last year
def is_recent_video(upload_date):
    try:
        # yt-dlp reports upload_date in YYYYMMDD format
        video_date = datetime.strptime(upload_date, "%Y%m%d")
        one_year_ago = datetime.now() - timedelta(days=365)
        return video_date >= one_year_ago
    except ValueError:
        return False
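
# Note: timedelta(days=365) approximates "one year"; around leap years the cutoff
# shifts by a day, which is acceptable for this kind of filter.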


# Main routine: crawl video info from a listing page
def fetch_recent_videos_from_pornhub(url):
    try:
        # Get the flat list of videos on the listing page (one JSON object per line)
        result = subprocess.run(
            ["yt-dlp", "--flat-playlist", "--dump-json", url],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )
        # Parse the JSON output line by line
        videos = [json.loads(line) for line in result.stdout.strip().split("\n")]

        # Collect the videos that pass the date filter
        recent_videos = []
        for video in videos:
            video_url = video.get("url")
            print(f"Extracting video: {video_url}")
            metadata = fetch_video_metadata(video_url)

            if metadata:
                upload_date = metadata.get("upload_date")
                if upload_date and is_recent_video(upload_date):
                    # Pull out the engagement metrics
                    title = metadata.get("title", "Unknown title")
                    like_count = metadata.get("like_count", 0)
                    dislike_count = metadata.get("dislike_count", 0)
                    view_count = metadata.get("view_count", 0)
                    comment_count = metadata.get("comment_count", 0)

                    video_data = {
                        "title": title,
                        "url": video_url,
                        "upload_date": upload_date,
                        "likes": like_count,
                        "dislikes": dislike_count,
                        "views": view_count,
                        "comments": comment_count,
                    }
                    recent_videos.append(video_data)

                    # Print a short summary for each matching video
                    print(f"Title: {title}")
                    print(f"Upload date: {upload_date}")
                    print(f"Likes: {like_count}, dislikes: {dislike_count}, views: {view_count}, comments: {comment_count}")
                    print("-" * 50)

        return recent_videos
    except Exception as e:
        print(f"Crawl failed: {e}")
        return []


# Example Pornhub listing page (sorted by popularity or upload time)
playlist_url = "https://www.pornhub.com/video?o=mr"  # sorted by most recent
videos = fetch_recent_videos_from_pornhub(playlist_url)

# Save the results to a file
with open("recent_videos.json", "w", encoding="utf-8") as f:
    json.dump(videos, f, ensure_ascii=False, indent=4)

print("Crawl finished; results saved to recent_videos.json!")