This repository has been archived on 2026-01-07. You can view files and clone it, but cannot push or open issues or pull requests.
Files
resources/pornhub/cmd.py
2025-03-17 11:30:35 +08:00

92 lines
3.4 KiB
Python

import subprocess
import json
from datetime import datetime, timedelta
# Extract a single video's metadata with yt-dlp, without downloading it.
def fetch_video_metadata(url):
    """Return the yt-dlp metadata dict for *url*, or None on any failure.

    Runs ``yt-dlp --skip-download --dump-json <url>`` as a subprocess and
    parses its stdout as one JSON object.

    Parameters:
        url: the video page URL to probe.

    Returns:
        dict of metadata on success, None on failure (error is printed).
    """
    try:
        result = subprocess.run(
            ["yt-dlp", "--skip-download", "--dump-json", url],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )
        # Fail with yt-dlp's own error message rather than letting
        # json.loads choke on empty stdout when the tool exits non-zero.
        if result.returncode != 0:
            raise RuntimeError(result.stderr.strip())
        return json.loads(result.stdout)
    except Exception as e:
        print(f"提取元数据失败: {e}")
        return None
# Filter: keep only videos uploaded within the last year.
def is_recent_video(upload_date):
    """Return True if *upload_date* (``YYYYMMDD`` string) is within the
    last 365 days.

    Returns False for older dates and for malformed or missing input
    (e.g. None, wrong format) — callers treat "unparsable" as "not recent".
    """
    try:
        video_date = datetime.strptime(upload_date, "%Y%m%d")
    except (ValueError, TypeError):
        # ValueError: bad format; TypeError: non-string (e.g. None).
        return False
    one_year_ago = datetime.now() - timedelta(days=365)
    return video_date >= one_year_ago
# Main crawler: enumerate a listing page and collect recent videos' metrics.
def fetch_recent_videos_from_pornhub(url):
    """Crawl a listing page and return metadata for videos from the last year.

    Uses ``yt-dlp --flat-playlist --dump-json`` to enumerate the page (one
    JSON object per stdout line), then fetches full metadata per video via
    fetch_video_metadata and keeps those passing is_recent_video.

    Parameters:
        url: listing/playlist page URL.

    Returns:
        list of dicts with keys title/url/upload_date/likes/dislikes/
        views/comments; [] on any failure (error is printed).
    """
    try:
        result = subprocess.run(
            ["yt-dlp", "--flat-playlist", "--dump-json", url],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )
        # One JSON object per line; skip blank lines so an empty stdout
        # (yt-dlp failure) doesn't crash json.loads('').
        videos = [
            json.loads(line)
            for line in result.stdout.splitlines()
            if line.strip()
        ]
        recent_videos = []
        for video in videos:
            video_url = video.get("url")
            if not video_url:
                # Flat-playlist entries can lack a URL; don't pass None
                # into the subprocess argv.
                continue
            print(f"正在提取视频: {video_url}")
            metadata = fetch_video_metadata(video_url)
            if not metadata:
                continue
            upload_date = metadata.get("upload_date")
            if not upload_date or not is_recent_video(upload_date):
                continue
            # Engagement metrics with safe defaults for missing fields.
            title = metadata.get("title", "未知标题")
            like_count = metadata.get("like_count", 0)
            dislike_count = metadata.get("dislike_count", 0)
            view_count = metadata.get("view_count", 0)
            comment_count = metadata.get("comment_count", 0)
            recent_videos.append({
                "title": title,
                "url": video_url,
                "upload_date": upload_date,
                "likes": like_count,
                "dislikes": dislike_count,
                "views": view_count,
                "comments": comment_count,
            })
            # Progress output per accepted video.
            print(f"标题: {title}")
            print(f"上传日期: {upload_date}")
            print(f"喜欢: {like_count}, 不喜欢: {dislike_count}, 查看: {view_count}, 评论: {comment_count}")
            print("-" * 50)
        return recent_videos
    except Exception as e:
        print(f"爬取失败: {e}")
        return []
# Example listing page (sorted by most recent uploads).
playlist_url = "https://www.pornhub.com/video?o=mr"

# Guard the crawl so importing this module stays side-effect free;
# running it as a script behaves exactly as before.
if __name__ == "__main__":
    videos = fetch_recent_videos_from_pornhub(playlist_url)
    # Persist results as pretty-printed UTF-8 JSON.
    with open("recent_videos.json", "w", encoding="utf-8") as f:
        json.dump(videos, f, ensure_ascii=False, indent=4)
    print("已完成爬取,结果保存在 recent_videos.json 中!")