add some scripts.
scripts/pornhub/cmd.py (new file, 92 lines)
@@ -0,0 +1,92 @@
import subprocess
import json
from datetime import datetime, timedelta


# Extract a video's metadata with yt-dlp without downloading the video.
def fetch_video_metadata(url):
    try:
        result = subprocess.run(
            ["yt-dlp", "--skip-download", "--dump-json", url],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )
        # Parse the JSON metadata printed on stdout.
        video_info = json.loads(result.stdout)
        return video_info
    except Exception as e:
        print(f"Failed to extract metadata: {e}")
        return None


# Keep only videos uploaded within the last year.
def is_recent_video(upload_date):
    try:
        # The upload date is formatted as YYYYMMDD.
        video_date = datetime.strptime(upload_date, "%Y%m%d")
        one_year_ago = datetime.now() - timedelta(days=365)
        return video_date >= one_year_ago
    except ValueError:
        return False


# Main routine: crawl the video information from one listing page.
def fetch_recent_videos_from_pornhub(url):
    try:
        # Fetch the flat metadata of every video on the listing page.
        result = subprocess.run(
            ["yt-dlp", "--flat-playlist", "--dump-json", url],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )
        # yt-dlp prints one JSON object per entry, so parse line by line.
        videos = [json.loads(line) for line in result.stdout.strip().split("\n")]

        # Collect the videos that pass the date filter.
        recent_videos = []
        for video in videos:
            video_url = video.get("url")
            print(f"Extracting video: {video_url}")
            metadata = fetch_video_metadata(video_url)

            if metadata:
                upload_date = metadata.get("upload_date")
                if upload_date and is_recent_video(upload_date):
                    # Pull out the engagement metrics.
                    title = metadata.get("title", "Unknown title")
                    like_count = metadata.get("like_count", 0)
                    dislike_count = metadata.get("dislike_count", 0)
                    view_count = metadata.get("view_count", 0)
                    comment_count = metadata.get("comment_count", 0)

                    video_data = {
                        "title": title,
                        "url": video_url,
                        "upload_date": upload_date,
                        "likes": like_count,
                        "dislikes": dislike_count,
                        "views": view_count,
                        "comments": comment_count,
                    }
                    recent_videos.append(video_data)

                    # Print a short summary of each matching video.
                    print(f"Title: {title}")
                    print(f"Upload date: {upload_date}")
                    print(f"Likes: {like_count}, dislikes: {dislike_count}, views: {view_count}, comments: {comment_count}")
                    print("-" * 50)

        return recent_videos
    except Exception as e:
        print(f"Crawl failed: {e}")
        return []


# Example listing page (o=mr sorts by most recent).
playlist_url = "https://www.pornhub.com/video?o=mr"
videos = fetch_recent_videos_from_pornhub(playlist_url)

# Save the results to a file.
with open("recent_videos.json", "w", encoding="utf-8") as f:
    json.dump(videos, f, ensure_ascii=False, indent=4)

print("Crawl finished; results saved to recent_videos.json.")
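
The two subprocess.run calls above do not pass check=True, so a yt-dlp failure returns normally with an empty stdout and only surfaces later as a JSON parse error. A minimal hardening sketch, not part of the commit (the name fetch_video_metadata_checked is illustrative):

import json
import subprocess

def fetch_video_metadata_checked(url):
    """Variant of fetch_video_metadata above that surfaces yt-dlp's own error."""
    result = subprocess.run(
        ["yt-dlp", "--skip-download", "--dump-json", url],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
    )
    if result.returncode != 0:
        # stdout is empty on failure, so report stderr instead of a JSONDecodeError.
        print(f"yt-dlp failed: {result.stderr.strip()}")
        return None
    return json.loads(result.stdout)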
scripts/pornhub/config.py (new file, 31 lines)
@@ -0,0 +1,31 @@
import logging
import os
import inspect
from datetime import datetime


# MySQL configuration
db_config = {
    'host': '172.18.0.3',
    'user': 'root',
    'password': 'mysqlpw',
    'database': 'stockdb'
}


# Configure logging
def setup_logging(log_filename=None):
    # If no log_filename is given, derive one from the calling script's name.
    if log_filename is None:
        # Get the filename of the script that called setup_logging.
        caller_frame = inspect.stack()[1]
        caller_filename = os.path.splitext(os.path.basename(caller_frame.filename))[0]

        # Current date, formatted as yyyymmdd.
        current_date = datetime.now().strftime('%Y%m%d')
        # Build the log filename with the date inserted before the extension.
        log_filename = f'./log/{caller_filename}_{current_date}.log'

    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] (%(funcName)s) - %(message)s',
                        handlers=[
                            logging.FileHandler(log_filename),
                            logging.StreamHandler()
                        ])
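
A short usage sketch for setup_logging, not part of the commit (the date in the example filename is illustrative): when called without arguments from a script named get_list.py, it derives the log path from the caller's filename plus today's date. Note that logging.FileHandler does not create directories, so ./log must exist beforehand.

import config

# Derives the log file from the calling script's name plus today's date,
# e.g. ./log/get_list_20250101.log (the ./log directory must already exist).
config.setup_logging()

# An explicit path can be passed instead:
# config.setup_logging(log_filename='./log/manual_run.log')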
scripts/pornhub/custom_pornhub.py (new file, 51 lines)
@@ -0,0 +1,51 @@
from yt_dlp.extractor.pornhub import PornHubIE
import re


# Note: registering this extractor did not take effect, so the yt-dlp source
# was patched directly instead.
class CustomPornHubIE(PornHubIE):
    def _real_extract(self, url):
        # Print the URL being processed.
        self.to_screen(f"Debug: handling URL: {url}")

        # Run the parent class's extraction logic first.
        original_data = super()._real_extract(url)

        # Download the page HTML.
        webpage = self._download_webpage(url, url)
        self.to_screen(f"Debug: raw page content: {webpage}")

        # Extract the favorites count (contents of the favoritesCounter span).
        favorites_raw = self._search_regex(
            r'<span class="favoritesCounter">\s*([\dKkMm,. ]+)\s*</span>',
            webpage, 'favorites count', fatal=False)

        # Debug: print the raw favorites value and the extracted metadata.
        self.to_screen(f"Debug: raw favorites value: {favorites_raw}")
        self.to_screen(f"Debug: extracted metadata: {original_data}")

        # If a favorites count was found, parse it and convert the unit.
        if favorites_raw:
            # Strip whitespace and thousands separators, then parse number and unit.
            favorites_cleaned = favorites_raw.strip().replace(',', '')
            favorites_count = self._convert_to_number(favorites_cleaned)
            original_data['favorites_count'] = favorites_count
        else:
            original_data['favorites_count'] = 0

        return original_data

    def _convert_to_number(self, value):
        """
        Parse a string into a number, supporting units such as K (thousand)
        and M (million).
        """
        match = re.match(r'^([\d.]+)([KkMm]?)$', value)
        if not match:
            return None
        number = float(match.group(1))
        unit = match.group(2).upper()

        if unit == 'K':  # thousand
            return int(number * 1000)
        elif unit == 'M':  # million
            return int(number * 1000000)
        return int(number)  # no unit, return the number as-is
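
For reference, a few worked examples of the unit conversion done by _convert_to_number above (the same logic is duplicated as convert_to_number in get_list.py below); the values are illustrative:

# _convert_to_number("1.2K")   -> 1200
# _convert_to_number("3M")     -> 3000000
# _convert_to_number("987")    -> 987
# _convert_to_number("12 345") -> None  (internal spaces do not match the regex)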
scripts/pornhub/get_list.py (new file, 178 lines)
@@ -0,0 +1,178 @@
import json
import sys
import os
import time
import re
import logging
from datetime import datetime, timedelta
from urllib.parse import urlparse, parse_qs
from yt_dlp import YoutubeDL
import config
from custom_pornhub import CustomPornHubIE


# yt-dlp options for fetching video metadata
ydl_opts = {
    'http_headers': {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
        'Referer': 'https://www.pornhub.com/',
    },
    'extract_flat': True,   # only extract metadata, do not download videos
    'skip_download': True,
    'verbose': True,        # verbose logging
}


meta_dir = './meta'
list_file = f'{meta_dir}/video_list.json'
detail_file = f'{meta_dir}/video_details.json'


config.setup_logging()


def convert_to_number(value):
    """
    Parse a string into a number, supporting units such as K (thousand)
    and M (million).
    """
    match = re.match(r'^([\d.]+)([KkMm]?)$', value)
    if not match:
        return None
    number = float(match.group(1))
    unit = match.group(2).upper()

    if unit == 'K':  # thousand
        return int(number * 1000)
    elif unit == 'M':  # million
        return int(number * 1000000)
    return int(number)  # no unit, return the number as-is


# Keep only videos uploaded within the last year.
def is_recent_video(upload_date):
    try:
        # The upload date is formatted as YYYYMMDD.
        video_date = datetime.strptime(upload_date, "%Y%m%d")
        one_year_ago = datetime.now() - timedelta(days=365)
        return video_date >= one_year_ago
    except ValueError:
        return False


# Fetch the video list.
def fetch_video_list(output_file="video_list.json"):
    url = "https://www.pornhub.com/video?o=mr"  # sorted by most recent

    # Crawl the listing page.
    with YoutubeDL(ydl_opts) as ydl:
        info_dict = ydl.extract_info(url, download=False)
        entries = info_dict.get('entries', [])

    # Save to the output file, one record per line.
    with open(output_file, 'w', encoding='utf-8') as f:
        for entry in entries:
            # Strip newlines and other invisible characters.
            cleaned_entry = {k: str(v).replace("\n", " ").replace("\r", " ") for k, v in entry.items()}
            # Write one line of JSON.
            f.write(json.dumps(cleaned_entry, ensure_ascii=False) + "\n")


# Load the details that have already been fetched.
def load_processed_details(file_name="video_details.json"):
    video_list = []
    id_list = []
    if os.path.exists(file_name):
        with open(file_name, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()  # strip surrounding whitespace
                if line:  # skip empty lines
                    video = json.loads(line)  # parse the JSON record into a dict
                    video_list.append(video)
                    video_id = video.get("id")  # extract the video ID
                    if video_id:
                        id_list.append(video_id)  # remember it so the video can be skipped later
        #return json.load(f)
    return video_list, id_list


# Append one detail record to the details file.
def save_processed_details(data, file_name="video_details.json"):
    with open(file_name, 'a+', encoding='utf-8') as f:
        # Strip newlines and other invisible characters.
        cleaned_entry = {k: str(v).replace("\n", " ").replace("\r", " ") for k, v in data.items()}
        # Write one line of JSON.
        f.write(json.dumps(cleaned_entry, ensure_ascii=False) + "\n")
        #json.dump(data, f, ensure_ascii=False, indent=4)


# Fetch details for the videos one by one.
def fetch_video_details(list_file="video_list.json", details_file="video_details.json"):
    # Load the video list.
    video_list = []
    with open(list_file, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()  # strip surrounding whitespace
            if line:  # skip empty lines
                video_list.append(json.loads(line))  # parse the JSON record into a dict
        #video_list = json.load(f)

    # Load the details that were already processed.
    processed_details, id_list = load_processed_details(details_file)

    with YoutubeDL(ydl_opts) as ydl:
        # Register the custom extractor.
        ydl.add_info_extractor(CustomPornHubIE())

        for video in video_list:
            try:
                # Fetch this video's details.
                url = video.get('url')
                if not url:
                    logging.info(f"Wrong video, no url: {video.get('title')}")
                    continue
                parsed_url = urlparse(url)
                query_params = parse_qs(parsed_url.query)
                video_id = query_params.get('viewkey', [None])[0]

                if video_id in id_list:
                    logging.info(f"Skipping existing video: (ID: {video_id}) {video.get('title')}")
                    continue

                logging.info(f"processing video: {video.get('title')} (ID: {video_id})")
                video_info = ydl.extract_info(url, download=False)

                # Collect favorites, likes, and related metrics
                # (assuming they can be parsed from the page).
                video_data = {
                    'title': video_info.get('title'),
                    'url': url,
                    'id': video_id,
                    'upload_date': video_info.get('upload_date'),
                    'view_count': video_info.get('view_count'),
                    'like_count': video_info.get('like_count'),
                    'dislike_count': video_info.get('dislike_count'),
                    # May need manual extraction; CustomPornHubIE above stores this
                    # value under 'favorites_count'.
                    'favorite_count': convert_to_number(str(video_info.get('favorite_count') or 0)),
                }

                # Save progress after every video.
                save_processed_details(video_data, details_file)
                logging.info(f"get video detail succ: {video.get('title')}")

            except Exception as e:
                logging.error(f"get video detail error: {video.get('title')}, msg: {e}")
            time.sleep(1)

    logging.info("fetch_video_details done!")


def main():
    if len(sys.argv) != 2:
        print("Usage: python get_list.py <cmd>")
        print("cmd: get_list, get_detail, get_all")
        sys.exit(1)

    cmd = sys.argv[1]

    if cmd == "get_list":
        fetch_video_list(list_file)                   # fetch the listing only
    elif cmd == "get_detail":
        fetch_video_details(list_file, detail_file)   # fetch details only
    elif cmd == "get_all":
        fetch_video_list(list_file)
        fetch_video_details(list_file, detail_file)
    else:
        print(f"Unknown command: {cmd}")


if __name__ == '__main__':
    main()
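
Since save_processed_details writes one JSON object per line and converts every value to a string, the crawl output can be inspected with a small standalone sketch like the one below (not part of the commit; it reads the default ./meta/video_details.json path used above):

import json

videos = []
with open('./meta/video_details.json', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        record = json.loads(line)
        # view_count was stored as a string (possibly "None"), so coerce defensively.
        views_str = record.get('view_count', '')
        views = int(views_str) if views_str.isdigit() else 0
        videos.append((views, record.get('title')))

# Print the ten most viewed videos collected so far.
for views, title in sorted(videos, key=lambda v: v[0], reverse=True)[:10]:
    print(f"{views:>12}  {title}")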
scripts/pornhub/test.py (new file, 35 lines)
@@ -0,0 +1,35 @@
from yt_dlp import YoutubeDL
from custom_pornhub import CustomPornHubIE
import logging


# Configure logging.
logger = logging.getLogger('yt_dlp')
logger.setLevel(logging.DEBUG)      # DEBUG level
handler = logging.StreamHandler()   # log to the terminal
logger.addHandler(handler)


# Custom options.
ydl_opts = {
    'extract_flat': True,   # only extract metadata
    'skip_download': True,  # skip the video download
    'verbose': True,        # enable verbose logging
    'logger': logger,       # use the custom logger
}


with YoutubeDL(ydl_opts) as ydl:
    # Register the custom extractor.
    ydl.add_info_extractor(CustomPornHubIE())

    # Print the registered extractors.
    print("Debug: registered extractors:")
    for ie_name in ydl._ies:
        print(ie_name)

    # Extract the video information.
    info = ydl.extract_info("https://www.pornhub.com/view_video.php?viewkey=6710f3bc00200")

    # Print the full info dict.
    print(info)

    # Print the favorites count.
    print(f"Favorites count: {info.get('favorites_count', 'not found')}")