import subprocess
import json
from datetime import datetime, timedelta


def fetch_video_metadata(url):
    """Extract a single video's metadata via yt-dlp without downloading it.

    Args:
        url: Video page URL to probe.

    Returns:
        The parsed metadata dict, or None if yt-dlp fails or emits
        unparseable output.
    """
    try:
        result = subprocess.run(
            ["yt-dlp", "--skip-download", "--dump-json", url],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )
        # BUG FIX: the original never checked the exit status, then crashed
        # on json.loads("") when yt-dlp produced no output.
        if result.returncode != 0 or not result.stdout.strip():
            print(f"提取元数据失败: {result.stderr.strip()}")
            return None
        return json.loads(result.stdout)
    except (OSError, subprocess.SubprocessError, json.JSONDecodeError) as e:
        # Narrowed from a blanket `except Exception` to the failure modes
        # this call can actually produce (spawn error, subprocess error,
        # malformed JSON).
        print(f"提取元数据失败: {e}")
        return None


def is_recent_video(upload_date):
    """Return True if upload_date (a "YYYYMMDD" string) is within 365 days.

    Missing or malformed dates are treated as not recent.
    """
    try:
        # yt-dlp reports upload dates as YYYYMMDD strings.
        video_date = datetime.strptime(upload_date, "%Y%m%d")
    except (TypeError, ValueError):
        # BUG FIX: original caught only ValueError; a None upload_date
        # raises TypeError from strptime.
        return False
    one_year_ago = datetime.now() - timedelta(days=365)
    return video_date >= one_year_ago


def fetch_recent_videos_from_pornhub(url):
    """Scrape a listing page and collect videos uploaded in the last year.

    Uses yt-dlp's flat-playlist mode to enumerate entries, then fetches
    full metadata per video and filters by upload date.

    Args:
        url: Listing/playlist page URL.

    Returns:
        A list of dicts with title, url, upload_date and engagement counts;
        an empty list on failure.
    """
    try:
        result = subprocess.run(
            ["yt-dlp", "--flat-playlist", "--dump-json", url],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )
        if result.returncode != 0:
            # BUG FIX: surface yt-dlp failures instead of parsing empty output.
            print(f"爬取失败: {result.stderr.strip()}")
            return []
        # yt-dlp emits one JSON object per line; skip blank lines so
        # json.loads never sees an empty string (original crashed here
        # when stdout was empty).
        videos = [
            json.loads(line)
            for line in result.stdout.splitlines()
            if line.strip()
        ]

        recent_videos = []
        for video in videos:
            video_url = video.get("url")
            if not video_url:
                # Flat-playlist entries occasionally lack a URL; the
                # original passed None straight into the yt-dlp argv.
                continue
            print(f"正在提取视频: {video_url}")
            metadata = fetch_video_metadata(video_url)
            if not metadata:
                continue
            upload_date = metadata.get("upload_date")
            if not upload_date or not is_recent_video(upload_date):
                continue

            # Collect engagement metrics, defaulting to 0 when absent.
            title = metadata.get("title", "未知标题")
            like_count = metadata.get("like_count", 0)
            dislike_count = metadata.get("dislike_count", 0)
            view_count = metadata.get("view_count", 0)
            comment_count = metadata.get("comment_count", 0)
            recent_videos.append(
                {
                    "title": title,
                    "url": video_url,
                    "upload_date": upload_date,
                    "likes": like_count,
                    "dislikes": dislike_count,
                    "views": view_count,
                    "comments": comment_count,
                }
            )

            print(f"标题: {title}")
            print(f"上传日期: {upload_date}")
            print(f"喜欢: {like_count}, 不喜欢: {dislike_count}, 查看: {view_count}, 评论: {comment_count}")
            print("-" * 50)
        return recent_videos
    except (OSError, subprocess.SubprocessError, json.JSONDecodeError) as e:
        print(f"爬取失败: {e}")
        return []


if __name__ == "__main__":
    # BUG FIX: guarded the script entry point so importing this module no
    # longer fires network subprocesses as a side effect.
    # Example listing page sorted by most recent uploads.
    playlist_url = "https://www.pornhub.com/video?o=mr"
    videos = fetch_recent_videos_from_pornhub(playlist_url)
    # Persist the collected results as UTF-8 JSON.
    with open("recent_videos.json", "w", encoding="utf-8") as f:
        json.dump(videos, f, ensure_ascii=False, indent=4)
    print("已完成爬取,结果保存在 recent_videos.json 中!")