add some scripts.
This commit is contained in:
92
scripts/pornhub/cmd.py
Normal file
92
scripts/pornhub/cmd.py
Normal file
@ -0,0 +1,92 @@
|
|||||||
|
import subprocess
|
||||||
|
import json
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
|
||||||
|
# 定义一个函数,使用 yt-dlp 提取视频元数据
|
||||||
|
def fetch_video_metadata(url):
    """Fetch the metadata of a single video by shelling out to ``yt-dlp``.

    Runs ``yt-dlp --skip-download --dump-json`` so that only metadata is
    retrieved and the media itself is never downloaded.

    Args:
        url: Address of the video page.

    Returns:
        The decoded JSON document printed by ``yt-dlp``, or ``None`` when the
        URL is missing, the command cannot be run, it prints nothing, or its
        output is not valid JSON. Failures are reported on stdout.
    """
    if not url:
        # Nothing to query; avoid spawning a process with an empty argument.
        print("提取元数据失败: 缺少视频 URL")
        return None

    try:
        result = subprocess.run(
            # "--" ends option parsing so a URL that starts with "-" can
            # never be interpreted as a yt-dlp flag.
            ["yt-dlp", "--skip-download", "--dump-json", "--", url],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )
        if not result.stdout.strip():
            # No JSON was produced: surface yt-dlp's own error text instead
            # of an opaque JSON decoding error.
            detail = result.stderr.strip() or f"yt-dlp exited with code {result.returncode}"
            print(f"提取元数据失败: {detail}")
            return None
        return json.loads(result.stdout)
    except (OSError, TypeError, ValueError, subprocess.SubprocessError) as e:
        # OSError: yt-dlp missing or not executable; TypeError: non-string
        # URL; ValueError: undecodable output or invalid JSON.
        print(f"提取元数据失败: {e}")
        return None
|
||||||
|
|
||||||
|
# 筛选最近一年的视频
|
||||||
|
def is_recent_video(upload_date):
    """Tell whether a video was uploaded within the last 365 days.

    Args:
        upload_date: Upload date as a ``YYYYMMDD`` string.

    Returns:
        ``True`` when the date is on or after the calendar day that lies 365
        days before today; ``False`` otherwise, including when the value is
        missing, not a string, or not in ``YYYYMMDD`` format.
    """
    try:
        video_day = datetime.strptime(upload_date, "%Y%m%d").date()
    except (TypeError, ValueError):
        # TypeError: not a string (e.g. None); ValueError: wrong format.
        return False

    # Compare calendar days: the parsed value carries no time of day, so
    # comparing it with a full timestamp would reject a video uploaded on
    # the cutoff day itself.
    cutoff_day = (datetime.now() - timedelta(days=365)).date()
    return video_day >= cutoff_day
|
||||||
|
|
||||||
|
# 主函数:爬取一个列表页面中的视频信息
|
||||||
|
def fetch_recent_videos_from_pornhub(url):
    """Collect metadata for the recently uploaded videos of a listing page.

    The listing is enumerated with ``yt-dlp --flat-playlist --dump-json``;
    every entry is then looked up individually with ``fetch_video_metadata``
    and kept only when ``is_recent_video`` accepts its upload date. Progress
    and the kept records are printed to stdout.

    Args:
        url: Address of the listing (playlist-like) page.

    Returns:
        A list of dicts with the keys ``title``, ``url``, ``upload_date``,
        ``likes``, ``dislikes``, ``views`` and ``comments``. The list is empty
        when the URL is missing or the listing cannot be read; if a failure
        occurs midway, the records collected so far are returned.
    """
    recent_videos = []
    if not url:
        print("爬取失败: 缺少列表页面 URL")
        return recent_videos

    try:
        result = subprocess.run(
            # "--" keeps a URL starting with "-" from being read as an option.
            ["yt-dlp", "--flat-playlist", "--dump-json", "--", url],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )
        # One JSON document per line. Blank lines are skipped so that empty
        # output yields an empty list instead of a JSON decoding error.
        entries = [
            json.loads(line)
            for line in result.stdout.splitlines()
            if line.strip()
        ]
        if not entries and result.returncode != 0:
            print(f"爬取失败: {result.stderr.strip()}")
            return recent_videos

        for entry in entries:
            video_url = entry.get("url")
            if not video_url:
                # Without a URL there is nothing to look up.
                continue

            print(f"正在提取视频: {video_url}")
            metadata = fetch_video_metadata(video_url)
            if not metadata:
                continue

            upload_date = metadata.get("upload_date")
            if not (upload_date and is_recent_video(upload_date)):
                continue

            title = metadata.get("title", "未知标题")
            like_count = metadata.get("like_count", 0)
            dislike_count = metadata.get("dislike_count", 0)
            view_count = metadata.get("view_count", 0)
            comment_count = metadata.get("comment_count", 0)

            recent_videos.append({
                "title": title,
                "url": video_url,
                "upload_date": upload_date,
                "likes": like_count,
                "dislikes": dislike_count,
                "views": view_count,
                "comments": comment_count,
            })

            print(f"标题: {title}")
            print(f"上传日期: {upload_date}")
            print(f"喜欢: {like_count}, 不喜欢: {dislike_count}, 查看: {view_count}, 评论: {comment_count}")
            print("-" * 50)
    except Exception as e:
        # Best-effort boundary: report the failure but keep whatever was
        # gathered before it happened.
        print(f"爬取失败: {e}")

    return recent_videos
|
||||||
|
|
||||||
|
# Pornhub 示例页面(按热度或时间排序的页面)
|
||||||
|
# Example listing page; the "o=mr" query selects ordering by time.
playlist_url = "https://www.pornhub.com/video?o=mr"
videos = fetch_recent_videos_from_pornhub(playlist_url)

# Persist the collected records as indented UTF-8 JSON.
with open("recent_videos.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(videos, ensure_ascii=False, indent=4))

print("已完成爬取,结果保存在 recent_videos.json 中!")
|
||||||
31
scripts/pornhub/config.py
Normal file
31
scripts/pornhub/config.py
Normal file
@ -0,0 +1,31 @@
|
|||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import inspect
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
# MySQL 配置
|
||||||
|
# MySQL connection settings.
# Each value can be overridden through an environment variable so that
# credentials do not have to be edited in (or committed to) this file; the
# fallbacks are the values this module has always used.
# NOTE(review): the fallback password is still stored in source — consider
# removing it once every deployment sets MYSQL_PASSWORD.
db_config = {
    'host': os.environ.get('MYSQL_HOST', '172.18.0.3'),
    'user': os.environ.get('MYSQL_USER', 'root'),
    'password': os.environ.get('MYSQL_PASSWORD', 'mysqlpw'),
    'database': os.environ.get('MYSQL_DATABASE', 'stockdb'),
}
|
||||||
|
|
||||||
|
# 设置日志配置
|
||||||
|
def setup_logging(log_filename=None):
    """Configure root logging to write to a log file and to the console.

    Args:
        log_filename: Path of the log file. When ``None``, the file is
            ``./log/<caller>_<YYYYMMDD>.log``, where ``<caller>`` is the base
            name (without extension) of the file that called this function
            and the date is today's.

    The directory of the log file is created when it does not exist yet.
    """
    if log_filename is None:
        # Frame 1 is the direct caller of setup_logging().
        caller_frame = inspect.stack()[1]
        caller_filename = os.path.splitext(os.path.basename(caller_frame.filename))[0]

        current_date = datetime.now().strftime('%Y%m%d')
        log_filename = f'./log/{caller_filename}_{current_date}.log'

    # FileHandler does not create missing directories; without this the
    # default "./log/..." path fails on a fresh checkout.
    log_dir = os.path.dirname(log_filename)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] (%(funcName)s) - %(message)s',
        handlers=[
            # Explicit UTF-8 so non-ASCII messages are written the same way
            # regardless of the platform's default encoding.
            logging.FileHandler(log_filename, encoding='utf-8'),
            logging.StreamHandler(),
        ],
    )
|
||||||
51
scripts/pornhub/custom_pornhub.py
Normal file
51
scripts/pornhub/custom_pornhub.py
Normal file
@ -0,0 +1,51 @@
|
|||||||
|
from yt_dlp.extractor.pornhub import PornHubIE
|
||||||
|
import re
|
||||||
|
|
||||||
|
# 不起作用,还是修改了源码
|
||||||
|
class CustomPornHubIE(PornHubIE):
    """PornHub extractor that also reports how often a video was favorited.

    The result of the stock extraction is extended with a ``favorites_count``
    entry scraped from the ``favoritesCounter`` element of the video page.
    """

    def _real_extract(self, url):
        """Run the parent extraction and add ``favorites_count`` to its result.

        Args:
            url: Address of the video page.

        Returns:
            The parent's info dict with an extra ``favorites_count`` key: the
            parsed count, ``0`` when the page could not be fetched or the
            counter was not found, or ``None`` when the counter text could
            not be parsed.
        """
        self.to_screen(f"调试: 处理的 URL 是: {url}")

        original_data = super()._real_extract(url)

        # The page is fetched again here to read the counter. The download
        # is non-fatal: the favorites count is optional and must not discard
        # the metadata that was already extracted.
        video_id = original_data.get('id') or url
        webpage = self._download_webpage(url, video_id, fatal=False)

        favorites_raw = None
        if webpage:
            favorites_raw = self._search_regex(
                r'<span class="favoritesCounter">\s*([\dKkMm,. ]+)\s*</span>',
                webpage, 'favorites count', fatal=False)

        self.to_screen(f"调试: 收藏原始内容: {favorites_raw}")
        self.to_screen(f"调试: 父类提取结果: {original_data}")

        if favorites_raw:
            # Drop thousands separators and any whitespace (including a
            # space between number and unit) before parsing.
            favorites_cleaned = re.sub(r'[\s,]', '', favorites_raw)
            original_data['favorites_count'] = self._convert_to_number(favorites_cleaned)
        else:
            original_data['favorites_count'] = 0

        return original_data

    def _convert_to_number(self, value):
        """Convert a count such as ``"1.2K"`` or ``"3M"`` into an integer.

        ``K`` multiplies by one thousand and ``M`` by one million; the unit
        is case-insensitive.

        Args:
            value: Text holding a decimal number with an optional unit.

        Returns:
            The count as an ``int``, or ``None`` when ``value`` is not a
            string or does not have the expected form.
        """
        if not isinstance(value, str):
            return None

        # Exactly one optional decimal point, so inputs such as "1.2.3" or
        # "." are rejected here instead of crashing in float().
        match = re.match(r'^(\d+(?:\.\d*)?|\.\d+)([KkMm]?)$', value)
        if not match:
            return None
        number = float(match.group(1))
        unit = match.group(2).upper()

        # round() rather than int(): binary floating point can leave the
        # product a hair below the intended integer.
        if unit == 'K':
            return round(number * 1000)
        if unit == 'M':
            return round(number * 1000000)
        return int(number)
|
||||||
178
scripts/pornhub/get_list.py
Normal file
178
scripts/pornhub/get_list.py
Normal file
@ -0,0 +1,178 @@
|
|||||||
|
import json
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import time
|
||||||
|
import re
|
||||||
|
import logging
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
from urllib.parse import urlparse, parse_qs
|
||||||
|
from yt_dlp import YoutubeDL
|
||||||
|
import config
|
||||||
|
from custom_pornhub import CustomPornHubIE
|
||||||
|
|
||||||
|
# 配置 yt-dlp 获取视频元数据
|
||||||
|
# HTTP headers sent with every request made through yt-dlp.
_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
    'Referer': 'https://www.pornhub.com/',
}

# Options handed to YoutubeDL: flat extraction, no media download, verbose
# output.
ydl_opts = dict(
    http_headers=_REQUEST_HEADERS,
    extract_flat=True,
    skip_download=True,
    verbose=True,
)

# Directory and files used to store the scraped metadata.
meta_dir = './meta'
list_file = meta_dir + '/video_list.json'
detail_file = meta_dir + '/video_details.json'
|
||||||
|
|
||||||
|
# Configure logging once at import time; no file name is passed, so the
# choice of log file is left to config.setup_logging().
config.setup_logging()
|
||||||
|
|
||||||
|
def convert_to_number(value):
    """Convert a count such as ``"1.2K"`` or ``"3M"`` into an integer.

    ``K`` multiplies by one thousand and ``M`` by one million; the unit is
    case-insensitive. Surrounding whitespace and thousands separators
    (``,``) are ignored.

    Args:
        value: Text holding a decimal number with an optional unit. An
            ``int`` is returned unchanged.

    Returns:
        The count as an ``int``, or ``None`` when ``value`` is missing, of an
        unsupported type, or not of the expected form.
    """
    if isinstance(value, int) and not isinstance(value, bool):
        # Already numeric (e.g. a count supplied by the extractor).
        return value
    if not isinstance(value, str):
        # Covers None: re.match() would raise TypeError on it.
        return None

    cleaned = value.strip().replace(',', '')
    # Exactly one optional decimal point, so inputs such as "1.2.3" or "."
    # are rejected here instead of crashing in float().
    match = re.match(r'^(\d+(?:\.\d*)?|\.\d+)([KkMm]?)$', cleaned)
    if not match:
        return None
    number = float(match.group(1))
    unit = match.group(2).upper()

    # round() rather than int(): binary floating point can leave the product
    # a hair below the intended integer.
    if unit == 'K':
        return round(number * 1000)
    if unit == 'M':
        return round(number * 1000000)
    return int(number)
|
||||||
|
|
||||||
|
# 筛选最近一年的视频
|
||||||
|
def is_recent_video(upload_date):
    """Tell whether a video was uploaded within the last 365 days.

    Args:
        upload_date: Upload date as a ``YYYYMMDD`` string.

    Returns:
        ``True`` when the date is on or after the calendar day that lies 365
        days before today; ``False`` otherwise, including when the value is
        missing, not a string, or not in ``YYYYMMDD`` format.
    """
    try:
        video_day = datetime.strptime(upload_date, "%Y%m%d").date()
    except (TypeError, ValueError):
        # TypeError: not a string (e.g. None); ValueError: wrong format.
        return False

    # Compare calendar days: the parsed value carries no time of day, so
    # comparing it with a full timestamp would reject a video uploaded on
    # the cutoff day itself.
    cutoff_day = (datetime.now() - timedelta(days=365)).date()
    return video_day >= cutoff_day
|
||||||
|
|
||||||
|
# 获取视频列表
|
||||||
|
def fetch_video_list(output_file="video_list.json", url="https://www.pornhub.com/video?o=mr"):
    """Enumerate a listing page and store its entries, one JSON object per line.

    Args:
        output_file: Path of the file to (over)write. Its directory is
            created when missing.
        url: Listing page to enumerate; defaults to the time-ordered video
            listing.

    Values keep their JSON types (``None`` stays ``null``, numbers stay
    numbers) so that later checks such as "entry has no URL" keep working;
    only string values have embedded line breaks replaced by spaces.
    """
    out_dir = os.path.dirname(output_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with YoutubeDL(ydl_opts) as ydl:
        info_dict = ydl.extract_info(url, download=False)
        # Extraction may yield no result or no "entries" key at all.
        entries = (info_dict or {}).get('entries') or []

        # Written while the YoutubeDL instance is still open, in case the
        # entries are produced lazily.
        with open(output_file, 'w', encoding='utf-8') as f:
            for entry in entries:
                if not entry:
                    continue
                cleaned_entry = {
                    k: v.replace("\n", " ").replace("\r", " ") if isinstance(v, str) else v
                    for k, v in entry.items()
                }
                # default=str is a deliberate fallback for values JSON cannot
                # represent natively.
                f.write(json.dumps(cleaned_entry, ensure_ascii=False, default=str) + "\n")
|
||||||
|
|
||||||
|
# 读取已完成的详情列表
|
||||||
|
def load_processed_details(file_name="video_details.json"):
    """Load the detail records that were already saved.

    The file is expected to hold one JSON object per line. Blank lines are
    ignored. Lines that are not valid JSON objects (for example a record
    truncated by an interrupted run) are skipped with a warning so that a
    single damaged line does not prevent resuming.

    Args:
        file_name: Path of the details file; it does not have to exist.

    Returns:
        A tuple ``(video_list, id_list)``: every record that was read, and
        the non-empty ``id`` values of those records, both in file order.
    """
    video_list = []
    id_list = []
    if not os.path.exists(file_name):
        return video_list, id_list

    with open(file_name, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue
            try:
                video = json.loads(line)
            except json.JSONDecodeError as e:
                logging.warning(f"Skipping malformed line {line_no} in {file_name}: {e}")
                continue
            if not isinstance(video, dict):
                logging.warning(f"Skipping non-object line {line_no} in {file_name}")
                continue

            video_list.append(video)
            video_id = video.get("id")
            if video_id:
                id_list.append(video_id)

    return video_list, id_list
|
||||||
|
|
||||||
|
# 保存详情到文件
|
||||||
|
def save_processed_details(data, file_name="video_details.json"):
    """Append one detail record to the details file as a single JSON line.

    Args:
        data: Mapping with the fields of one video.
        file_name: Path of the file to append to. The file and its directory
            are created when missing.

    Values keep their JSON types (``None`` is written as ``null``, numbers
    as numbers); only string values have embedded line breaks replaced by
    spaces.
    """
    out_dir = os.path.dirname(file_name)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    cleaned_entry = {
        k: v.replace("\n", " ").replace("\r", " ") if isinstance(v, str) else v
        for k, v in data.items()
    }
    with open(file_name, 'a', encoding='utf-8') as f:
        # default=str is a deliberate fallback for values JSON cannot
        # represent natively.
        f.write(json.dumps(cleaned_entry, ensure_ascii=False, default=str) + "\n")
|
||||||
|
|
||||||
|
# 逐个获取视频详情
|
||||||
|
def fetch_video_details(list_file="video_list.json", details_file="video_details.json"):
    """Fetch the details of every listed video that has not been saved yet.

    Reads ``list_file`` (one JSON object per line), skips entries without a
    URL and entries whose ``viewkey`` is already present in ``details_file``,
    extracts the remaining ones with yt-dlp and appends each result to
    ``details_file`` right away, so an interrupted run can be resumed.

    Args:
        list_file: Path of the list produced by ``fetch_video_list``.
        details_file: Path of the details file to read from and append to.

    A failure on one video is logged and does not stop the run. The function
    pauses one second after every attempted request.
    """
    video_list = []
    with open(list_file, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                video_list.append(json.loads(line))

    # Only the IDs are needed; a set gives constant-time lookups and lets
    # IDs handled during this run be remembered as well.
    _, id_list = load_processed_details(details_file)
    seen_ids = set(id_list)

    with YoutubeDL(ydl_opts) as ydl:
        ydl.add_info_extractor(CustomPornHubIE())

        for video in video_list:
            requested = False
            try:
                url = video.get('url')
                if not url:
                    logging.info(f"Wrong video, no url: {video.get('title')}")
                    continue
                parsed_url = urlparse(url)
                query_params = parse_qs(parsed_url.query)
                video_id = query_params.get('viewkey', [None])[0]

                if video_id in seen_ids:
                    logging.info(f"Skipping existed video: (ID: {video_id}) {video.get('title')}")
                    continue

                logging.info(f"processing video: {video.get('title')} (ID: {video_id})")
                requested = True
                video_info = ydl.extract_info(url, download=False)

                # The count may be published as 'favorite_count' or, by the
                # custom extractor, as 'favorites_count'; it may be text
                # such as "1.2K", already a number, or absent.
                raw_favorites = video_info.get('favorite_count')
                if raw_favorites is None:
                    raw_favorites = video_info.get('favorites_count')
                if isinstance(raw_favorites, str):
                    favorite_count = convert_to_number(raw_favorites)
                else:
                    favorite_count = raw_favorites

                video_data = {
                    'title': video_info.get('title'),
                    'url': url,
                    'id': video_id,
                    "upload_date": video_info.get('upload_date'),
                    'view_count': video_info.get('view_count'),
                    'like_count': video_info.get('like_count'),
                    'dislike_count': video_info.get('dislike_count'),
                    'favorite_count': favorite_count,
                }

                # Persist immediately so progress survives an interruption.
                save_processed_details(video_data, details_file)
                if video_id:
                    seen_ids.add(video_id)
                logging.info(f"get video detail succ: {video.get('title')}")

            except Exception as e:
                logging.error(f"get video detail error: {video.get('title')}, msg: {e}")

            if requested:
                # Throttle: pause after every request that was actually made.
                time.sleep(1)

    logging.info("fetch_video_details done!")
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Command-line entry point.

    Expects exactly one argument, the command to run:

    * ``get_list``   - fetch the video list into ``list_file``
    * ``get_detail`` - fetch the details of the listed videos into ``detail_file``
    * ``get_all``    - both of the above, in that order

    Exits with status 1 when the argument count is wrong or the command is
    unknown, so that callers such as shell scripts can detect the failure.
    """
    if len(sys.argv) != 2:
        print("Usage: python script.py <cmd>")
        print("cmd: get_list, get_detail, get_all")
        sys.exit(1)

    cmd = sys.argv[1]

    if cmd == "get_list":
        fetch_video_list(list_file)
    elif cmd == "get_detail":
        fetch_video_details(list_file, detail_file)
    elif cmd == "get_all":
        fetch_video_list(list_file)
        fetch_video_details(list_file, detail_file)
    else:
        print(f"Unknown command: {cmd}")
        print("cmd: get_list, get_detail, get_all")
        sys.exit(1)


if __name__ == '__main__':
    main()
|
||||||
35
scripts/pornhub/test.py
Normal file
35
scripts/pornhub/test.py
Normal file
@ -0,0 +1,35 @@
|
|||||||
|
from yt_dlp import YoutubeDL
|
||||||
|
from custom_pornhub import CustomPornHubIE
|
||||||
|
import logging
|
||||||
|
|
||||||
|
# 配置日志记录
|
||||||
|
# Send yt-dlp's messages to the terminal through a DEBUG-level logger.
logger = logging.getLogger('yt_dlp')
logger.setLevel(logging.DEBUG)
handler = logging.StreamHandler()
logger.addHandler(handler)

# Options for YoutubeDL: flat extraction, no media download, verbose output,
# all messages routed through the logger configured above.
ydl_opts = dict(
    extract_flat=True,
    skip_download=True,
    verbose=True,
    logger=logger,
)

with YoutubeDL(ydl_opts) as ydl:
    # Register the custom extractor.
    custom_extractor = CustomPornHubIE()
    ydl.add_info_extractor(custom_extractor)

    # List what is registered on this YoutubeDL instance.
    print("调试: 已注册的提取器列表:")
    for registered_name in ydl._ies:
        print(registered_name)

    # Extract one sample video and show the complete result.
    sample_url = "https://www.pornhub.com/view_video.php?viewkey=6710f3bc00200"
    info = ydl.extract_info(sample_url)
    print(info)

    # Show the favorites count, or a placeholder when the key is absent.
    favorites = info.get('favorites_count', '未找到')
    print(f"收藏次数: {favorites}")
|
||||||
Reference in New Issue
Block a user