diff --git a/thelordofporn/src/actress_fetch.py b/thelordofporn/bak_src/actress_fetch.py
similarity index 100%
rename from thelordofporn/src/actress_fetch.py
rename to thelordofporn/bak_src/actress_fetch.py
diff --git a/thelordofporn/bak_src/config.py b/thelordofporn/bak_src/config.py
new file mode 100644
index 0000000..5e637bc
--- /dev/null
+++ b/thelordofporn/bak_src/config.py
@@ -0,0 +1,91 @@
+import logging
+import os
+import inspect
+import time
+from datetime import datetime
+from logging.handlers import RotatingFileHandler
+from collections import defaultdict
+
+# Directories mapped to the host machine
+home_dir = os.path.expanduser("~")
+global_host_data_dir = f'{home_dir}/hostdir/scripts_data/thelordofporn'
+global_share_data_dir = f'{home_dir}/sharedata'
+
+# log-frequency bookkeeping
+log_count = defaultdict(int)        # how many times each message was logged
+last_log_time = defaultdict(float)  # timestamp of each message's last write
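+# (note: both dicts are keyed by the fully formatted message, so entries
+# accumulate indefinitely for messages that embed variable data)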
+
+log_dir = '../log'
+class RateLimitFilter(logging.Filter):
+ """
+ 频率限制过滤器:
+ 1. 在 60 秒内,同样的日志最多写入 60 次,超过则忽略
+ 2. 如果日志速率超过 100 条/秒,发出告警
+ """
+ LOG_LIMIT = 60 # 每分钟最多记录相同消息 10 次
+
+ def filter(self, record):
+ global log_count, last_log_time
+        message_key = record.getMessage()  # the fully formatted message
+
+        # time since this message was last seen
+        now = time.time()
+        elapsed = now - last_log_time[message_key]
+
+        # throttle repeated occurrences of the same message
+        if elapsed < 60:  # still inside the 60-second window
+            log_count[message_key] += 1
+            if log_count[message_key] > self.LOG_LIMIT:
+                print('log rate limit reached, dropping message.')
+                return False  # drop the record
+        else:
+            log_count[message_key] = 1  # window expired, restart the count
+
+        last_log_time[message_key] = now
+
+        return True  # allow the record through
+
+
+
+def setup_logging(log_filename=None):
+ if log_filename is None:
+ caller_frame = inspect.stack()[1]
+ caller_filename = os.path.splitext(os.path.basename(caller_frame.filename))[0]
+ current_date = datetime.now().strftime('%Y%m%d')
+ os.makedirs(log_dir, exist_ok=True)
+ log_filename = f'{log_dir}/{caller_filename}_{current_date}.log'
+ #log_filename = f'../log/{caller_filename}_{current_date}.log'
+
+    max_log_size = 100 * 1024 * 1024  # 100 MB
+    max_log_files = 10  # keep at most 10 rotated log files
+
+ file_handler = RotatingFileHandler(log_filename, maxBytes=max_log_size, backupCount=max_log_files)
+ file_handler.setFormatter(logging.Formatter(
+ '%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] (%(funcName)s) - %(message)s'
+ ))
+
+ console_handler = logging.StreamHandler()
+ console_handler.setFormatter(logging.Formatter(
+ '%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] (%(funcName)s) - %(message)s'
+ ))
+
+    # configure the root logger
+ logger = logging.getLogger()
+ logger.setLevel(logging.INFO)
+    logger.handlers = []  # avoid adding duplicate handlers
+ logger.addHandler(file_handler)
+ logger.addHandler(console_handler)
+
+    # attach the rate limiter to both handlers
+ rate_limit_filter = RateLimitFilter()
+ file_handler.addFilter(rate_limit_filter)
+ console_handler.addFilter(rate_limit_filter)
+
+
+# usage example
+if __name__ == "__main__":
+ setup_logging()
+
+ for i in range(1000):
+ logging.info("测试日志,检测频率限制")
+ time.sleep(0.01) # 模拟快速写入日志
\ No newline at end of file
diff --git a/thelordofporn/src/json_to_db.py b/thelordofporn/bak_src/json_to_db.py
similarity index 100%
rename from thelordofporn/src/json_to_db.py
rename to thelordofporn/bak_src/json_to_db.py
diff --git a/thelordofporn/src/list_fetch.py b/thelordofporn/bak_src/list_fetch.py
similarity index 100%
rename from thelordofporn/src/list_fetch.py
rename to thelordofporn/bak_src/list_fetch.py
diff --git a/thelordofporn/src/top_scenes.py b/thelordofporn/bak_src/top_scenes.py
similarity index 99%
rename from thelordofporn/src/top_scenes.py
rename to thelordofporn/bak_src/top_scenes.py
index 006580e..2513129 100644
--- a/thelordofporn/src/top_scenes.py
+++ b/thelordofporn/bak_src/top_scenes.py
@@ -23,7 +23,7 @@ list_url_scenes = 'https://thelordofporn.com/category/top-10/porn-scenes-movies/
list_url_pornstars = 'https://thelordofporn.com/category/pornstars-top-10/'
curr_novel_pages = 0
-res_dir = 'result'
+res_dir = '../result'
top_scenes_file = f'{res_dir}/top_scenes_list.csv'
top_pornstars_file = f'{res_dir}/top_pornstars_list.csv'
diff --git a/thelordofporn/src/config.py b/thelordofporn/src/config.py
index 5e637bc..b318d06 100644
--- a/thelordofporn/src/config.py
+++ b/thelordofporn/src/config.py
@@ -6,16 +6,16 @@ from datetime import datetime
from logging.handlers import RotatingFileHandler
from collections import defaultdict
-# Directories mapped to the host machine
home_dir = os.path.expanduser("~")
-global_host_data_dir = f'{home_dir}/hostdir/scripts_data/thelordofporn'
+global_host_data_dir = f'{home_dir}/hostdir/scripts_data'
global_share_data_dir = f'{home_dir}/sharedata'
+log_dir = '../log'
+
 # log-frequency bookkeeping
 log_count = defaultdict(int)        # how many times each message was logged
 last_log_time = defaultdict(float)  # timestamp of each message's last write
-log_dir = '../log'
class RateLimitFilter(logging.Filter):
"""
     Rate-limit filter: within any 60-second window, the same message is
@@ -43,8 +43,7 @@ class RateLimitFilter(logging.Filter):
last_log_time[message_key] = now
-        return True  # allow the record through
-
+        return True  # allow the record through
def setup_logging(log_filename=None):
diff --git a/thelordofporn/src/fetch.py b/thelordofporn/src/fetch.py
new file mode 100644
index 0000000..75c8b09
--- /dev/null
+++ b/thelordofporn/src/fetch.py
@@ -0,0 +1,198 @@
+
+import json
+import time
+import csv
+import argparse
+import textwrap
+import logging
+from functools import partial
+import config
+import sqlite_utils as db_tools
+import scraper
+import utils
+from urllib.parse import urljoin, urlparse
+
+config.setup_logging()
+
+debug = False
+skip_local = False
+scan_mode = 0
+update_mode = 0
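+# update_mode values (mirror --update): 0 = new records only, 1 = complete
+# records only, 2 = both, 3 = abnormal records (404/401), 4 = everything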
+
+# Fetch the actor list pages and upsert each row into the database
+def fetch_actor_list():
+ next_url = scraper.pornstar_url
+ while next_url:
+ logging.info(f"fetching url {next_url}")
+ soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="main", identifier="content", attr_type="id"))
+ if soup:
+ list_data, next_url = scraper.parse_actor_list(soup, next_url)
+            if list_data:
+                # insert the rows into the database
+                for row in list_data:
+                    row_id = db_tools.insert_actor_index(row)
+                    if row_id:
+                        logging.debug(f"insert or update one row. row id: {row_id}, data: {row}")
+                    else:
+                        logging.warning(f"insert or update actor failed. data: {row}")
+            else:
+                logging.warning(f"parse_actor_list failed. url: {next_url}")
+
+ elif status_code and status_code == scraper.http_code_404:
+ logging.warning(f'404 page. url: {next_url}')
+ break
+ elif status_code and status_code == scraper.http_code_login:
+            logging.warning(f'401 page (login required). url: {next_url}')
+ break
+ else:
+ logging.warning(f'fetch_page error. url: {next_url}')
+
+ if debug:
+ break
+ logging.info(f"fetch actor list finished.")
+
+# Fetch each performer's detail page and update the database
+def fetch_performers_detail():
+ limit_count = 5 if debug else 100
+ performers_list = []
+ last_performer_id = 0
+ abnormal_codes = [scraper.http_code_404, scraper.http_code_login]
+
+ def get_performers(**kwargs):
+ kwargs["order_by"] = 'id asc'
+ return db_tools.query_actors(limit=limit_count, **kwargs)
+
+ while True:
+        if update_mode == 0:    # new records only
+            performers_list = get_performers(start_id=0, is_full_data=0)
+        elif update_mode == 1:  # complete records only
+            performers_list = get_performers(start_id=last_performer_id, is_full_data=1)
+        elif update_mode == 2:  # modes 0 and 1 combined
+            performers_list = get_performers(start_id=last_performer_id, is_full_data_not_in=abnormal_codes)
+        elif update_mode == 3:  # abnormal records (404/401)
+            performers_list = get_performers(start_id=last_performer_id, is_full_data_in=abnormal_codes)
+        else:                   # everything
+            performers_list = get_performers(start_id=last_performer_id)
+
+ if len(performers_list) < 1:
+            logging.info('all performers fetched.')
+ break
+
+ succ_rows = 0
+ for performer in performers_list:
+ url = performer['href']
+ person = performer['name']
+
+ next_url = url
+ need_insert = True
+ while next_url:
+ logging.debug(f"Fetching data for actor ({person}), url {next_url} ...")
+ soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="main", identifier="content", attr_type="id"))
+ if soup:
+ data, next_url = scraper.parse_actor_detail(soup, next_url)
+ if data:
+                        # all pages for this performer fetched; insert the data
+ performer_id = db_tools.update_actor_detail(data, is_full_data=1)
+ if performer_id:
+ logging.debug(f'insert one person, id: {performer_id}, person: ({person}), url: {next_url}')
+ last_performer_id = performer_id
+ succ_rows += 1
+ else:
+ logging.warning(f'insert person: ({person}) {next_url} failed.')
+
+ elif status_code and status_code == scraper.http_code_404:
+ actor_id = db_tools.update_actor_detail({'href': next_url}, is_full_data=scraper.http_code_404)
+                    logging.warning(f'404 page. id: {actor_id}, name: ({person}), url: {next_url}, Skipping...')
+ need_insert = False
+ break
+ elif status_code and status_code == scraper.http_code_login:
+ actor_id = db_tools.update_actor_detail({'href': next_url}, is_full_data=scraper.http_code_login)
+                    logging.warning(f'401 page (login required). id: {actor_id}, name: ({person}), url: {next_url}, Skipping...')
+ need_insert = False
+ break
+ else:
+ logging.warning(f'fetch_page error. url: {next_url}')
+
+            # a 401/404 was already recorded above; skip this performer
+ if not need_insert:
+ continue
+ time.sleep(0.5)
+
+ logging.info(f'total request: {len(performers_list)}, succ: {succ_rows}, last performer id: {last_performer_id}')
+        # stop early in debug mode
+ if debug:
+ return True
+
+# Map command shortcuts to functions
+function_map = {
+ "actor_list": fetch_actor_list,
+ "actors" : fetch_performers_detail,
+}
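+# e.g. --cmd=actor_list,actors runs both functions in order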
+
+# Main entry point
+def main(cmd, args):
+    # run the requested functions
+    if cmd:
+        function_names = cmd.split(",")  # split the comma-separated input
+        for short_name in function_names:
+            func = function_map.get(short_name.strip())  # look up the function
+ if callable(func):
+ func()
+ else:
+ logging.warning(f" {short_name} is not a valid function shortcut.")
+    else:  # run everything
+        for name, func in function_map.items():
+            if callable(func):
+                func()
+            else:
+                logging.warning(f" {name} is not a valid function shortcut.")
+
+    logging.info('all processing completed!')
+
+ # TODO:
+ # 1,
+
+# Apply command-line flags to module globals
+def set_env(args):
+ global debug
+ debug = args.debug
+ if debug:
+ logger = logging.getLogger()
+ logger.setLevel(logging.DEBUG)
+
+ global skip_local
+ skip_local = args.skip_local
+
+ global scan_mode
+ scan_mode = args.scan_mode
+
+ global update_mode
+ if args.update:
+ update_mode = args.update
+
+if __name__ == "__main__":
+    # command-line argument handling
+ keys_str = ",".join(function_map.keys())
+
+    usage_examples = textwrap.dedent('''
+        Examples:
+            python3 ./fetch.py                   # refresh the list pages, then fetch newly added performers
+            python3 ./fetch.py --update=4        # refresh the list pages, then fetch every record
+            python3 ./fetch.py --cmd=actor_list  # refresh all performers on the list pages (three languages)
+            python3 ./fetch.py --cmd=actors      # fetch newly added performers
+    ''')
+
+    parser = argparse.ArgumentParser(
+        description='fetch thelordofporn data.\n\n' + usage_examples,
+        formatter_class=argparse.RawDescriptionHelpFormatter
+    )
+    parser.add_argument("--cmd", type=str, help=f"Comma-separated list of function shortcuts: {keys_str}")
+    parser.add_argument('--update', type=int, choices=[0, 1, 2, 3, 4], default=0, help='0 - new records only (is_full_data=0, default), 1 - complete records only (is_full_data=1), 2 - records with is_full_data<=1, 3 - abnormal records only (is_full_data>1), 4 - all records')
+    parser.add_argument('--scan_mode', type=int, choices=[0, 1, 2], default=1, help='1 - uncensored makers/series/actors/movies only (default), 0 - the opposite, 2 - everything')
+    parser.add_argument('--skip_local', action='store_true', help='skip database writes when the page is cached locally')
+    parser.add_argument('--debug', action='store_true', help='Enable debug mode (limit records)')
+ args = parser.parse_args()
+
+ set_env(args)
+ main(args.cmd, args)
\ No newline at end of file
diff --git a/thelordofporn/src/scraper.py b/thelordofporn/src/scraper.py
new file mode 100644
index 0000000..ef18623
--- /dev/null
+++ b/thelordofporn/src/scraper.py
@@ -0,0 +1,267 @@
+import cloudscraper
+import time
+import json
+import csv
+import logging
+import signal
+import sys
+import os
+import re
+from bs4 import BeautifulSoup
+from requests.exceptions import RequestException
+from functools import partial
+from urllib.parse import urljoin, urlparse
+import config
+import utils
+
+# Base URLs and tunable parameters
+host_url = "https://thelordofporn.com/"
+pornstar_url = "https://thelordofporn.com/pornstars/"
+lang_prefix = ["ja", "en", "zh"]
+
+http_code_404 = 404
+http_code_login = 401
+http_code_local = 99
+
+save_raw_html = False
+load_from_local = False
+
+# Impersonate a real browser
+HEADERS = {
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
+ "Referer": "https://thelordofporn.com/",
+}
+
+# Create a CloudScraper session to get past Cloudflare
+scraper = cloudscraper.create_scraper(
+ browser={"browser": "chrome", "platform": "windows", "mobile": False}
+)
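+# (cloudscraper keeps a requests-compatible session and attempts to solve
+# Cloudflare's JavaScript challenge automatically on first contact)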
+
+# Fetch a page with CloudScraper, run a page-specific validator, and support
+# different parsers and optional HTML preprocessing
+def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor=None):
+    if load_from_local:  # serve from the local cache when enabled
+        html = utils.read_raw_html(url)
+        if html:
+            # preprocess the HTML if a preprocessor was provided
+            html_text = preprocessor(html) if preprocessor else html
+
+            soup = BeautifulSoup(html_text, parser)
+            if validator(soup):  # run the custom page check
+                logging.debug(f"read from local. href: {url}")
+                return soup, http_code_local  # a code below 100 marks a local-cache hit
+
+ for attempt in range(max_retries):
+ try:
+ if 'thelordofporn.com' not in url.lower():
+ logging.error(f'wrong url format: {url}')
+ return None, None
+
+ response = scraper.get(url, headers=HEADERS)
+
+            # handle HTTP status codes
+            if response.status_code == 404:
+                logging.debug(f"Page not found (404): {url}")
+                return None, http_code_404  # return the 404 so the caller can skip
+
+            response.raise_for_status()  # raise on other HTTP errors
+
+            # check for redirects, e.g. to a login page
+            if response.history:
+                logging.debug(f"Page redirected on {url}. Checking if it's a login page.")
+                soup = BeautifulSoup(response.text, parser)
+                # detect the login page
+                if soup.find('nav', class_='panel form-panel'):
+                    logging.debug(f"Page redirected to login page on {url}.")
+                    return None, http_code_login
+
+ if save_raw_html:
+ utils.write_raw_html(url, response.text)
+
+            # preprocess the HTML if a preprocessor was provided
+ html_text = preprocessor(response.text) if preprocessor else response.text
+
+ soup = BeautifulSoup(html_text, parser)
+            if validator(soup):  # run the custom page check
+ return soup, response.status_code
+
+ logging.warning(f"Validation failed on attempt {attempt + 1} for {url}")
+        except cloudscraper.exceptions.CloudflareChallengeError as e:
+            logging.error(f"Cloudflare Challenge Error on {url}: {e}, Retrying...")
+        except cloudscraper.exceptions.CloudflareCode1020 as e:
+            logging.error(f"Access Denied (Error 1020) on {url}: {e}, Retrying...")
+        except Exception as e:
+            logging.error(f"Unexpected error on {url}: {e}, Retrying...")
+
+ logging.error(f'Fetching failed after max retries. {url}')
+    return None, None  # still failing after max retries
+
+# Fix the HTML structure before parsing: strip extra tags and repair broken
+# ones; needed when extracting the ethnicity field. (The literal tag strings
+# were swallowed during extraction; '<br>' below is a reconstruction.)
+def preprocess_html(html):
+    return html.replace('<br>', '')
+
+# Validate a fetched page by checking that a tag with the given id/class is
+# present (reconstructed from its call sites; the original body was lost)
+def generic_validator(soup, tag, identifier, attr_type):
+    if attr_type == "id":
+        return soup.find(tag, id=identifier) is not None
+    return soup.find(tag, class_=identifier) is not None
+
+# Parse one page of the pornstar list; returns (rows, next_page_url). The
+# per-entry extraction of title/rating/rank/votes was lost during extraction.
+def parse_actor_list(soup, href):
+    actress_list = []
+    next_page_url = None
+    try:
+        for entry in soup.select("article"):
+            title = rating = rank = votes = "N/A"  # original extraction lines lost
+
+            # append the row
+            actress_list.append({
+                "pornstar": title,
+                "rating": utils.parse_numeric(rating),
+                "rank": utils.parse_numeric(rank),
+                "votes": utils.parse_numeric(votes),
+                "href": href
+            })
+
+    except Exception as e:
+        logging.error(f"parse list failed: {e}, url: {href}")
+        return None, None
+
+    # find the next-page link
+ next_page_tag = soup.select_one(".nav-links .next.page-numbers")
+ if next_page_tag:
+ next_page_url = urljoin(host_url, next_page_tag["href"])
+ logging.debug(f"next page: {next_page_url}")
+ else:
+ logging.debug("find all pages.")
+
+ return actress_list, next_page_url
+
+# Parse an actor detail page and extract the fields we need
+def parse_actor_detail(soup, href):
+    # basic info
+ entry_header = soup.find("header", class_="entry-header")
+ name_el = entry_header.find("h1", class_="entry-title") if entry_header else None
+ name = name_el.text.strip() if name_el else ""
+
+ date_modified_el = soup.find("time", itemprop="dateModified")
+ if date_modified_el:
+ date_modified = date_modified_el.get("content", "").strip()
+ else:
+ date_modified = ""
+
+    # ranking metadata
+ global_rank = ""
+ weekly_rank = ""
+ last_month_rating = ""
+ current_rating = ""
+ total_votes = ""
+
+    for div in (entry_header.find_all("div", class_="porn-star-rank__item") if entry_header else []):  # guard against a missing header
+ text = div.text.strip()
+ if "Global Rank" in text:
+ global_rank = div.find("b").text.strip()
+ elif "Weekly Rank" in text:
+ weekly_rank = div.find("b").text.strip()
+
+ for item in soup.find_all("div", class_="specifications__item--horizontal"):
+ text = item.text.strip()
+ if "Last Month" in text:
+ last_month_rating = item.find("b").text.strip()
+ elif "Rating Av." in text:
+ current_rating = item.find("b").text.strip()
+ elif "Total of" in text:
+ total_votes = item.find("b").text.strip()
+
+    # detailed attributes
+ attributes = {}
+ for row in soup.find_all("div", class_="specifications-grid-row"):
+ items = row.find_all("div", class_="specifications-grid-item")
+ if len(items) == 2:
+ label = items[0].find("h5").text.strip()
+ value = items[0].find("span").text.strip()
+ attributes[label] = value
+
+ label2 = items[1].find("h5").text.strip()
+ value2 = items[1].find("span").text.strip()
+ attributes[label2] = value2
+
+    # birth info, height, weight, aliases
+ birth_info = utils.parse_birth_info(attributes.get("Born", ""))
+ height_info = utils.parse_height(attributes.get("Height", ""))
+ weight_info = utils.parse_weight(attributes.get("Weight", ""))
+ alias_list = utils.clean_alias(attributes.get("Name", ""))
+
+ return {
+ 'name': name,
+ 'href': href,
+ "alias": alias_list,
+ "career_start": attributes.get("Career start", ""),
+ "measurements": attributes.get("Measurements", ""),
+ "born": attributes.get("Born", ""),
+ "height": attributes.get("Height", ""),
+ "weight": attributes.get("Weight", ""),
+ "date_modified": date_modified,
+ "global_rank": utils.parse_numeric(global_rank),
+ "weekly_rank": utils.parse_numeric(weekly_rank),
+ "last_month_rating": utils.parse_numeric(last_month_rating),
+ "current_rating": utils.parse_numeric(current_rating),
+ "total_votes": utils.parse_numeric(total_votes),
+ **birth_info,
+ **height_info,
+ **weight_info,
+ }, None
+
+###### test code ######
+def test_actor_list():
+ next_url = pornstar_url
+ all_data = []
+ while next_url:
+ print(f'fetching page {next_url}')
+ soup, http_status_code = fetch_page(next_url, partial(generic_validator, tag="main", identifier="content", attr_type="id"))
+ if soup:
+ list_data, next_url = parse_actor_list(soup, next_url)
+            if list_data:
+                all_data.extend(list_data)
+        else:
+            print('failed to fetch a valid page.')
+ if next_url:
+ print(f"next url: {next_url}")
+ break
+ print(all_data)
+
+def test_actor():
+ next_url = 'https://thelordofporn.com/pornstars/eva-elfie/'
+ while next_url:
+ print(f'fetching page {next_url}')
+ soup, http_status_code = fetch_page(next_url, partial(generic_validator, tag="main", identifier="content", attr_type="id"))
+ if soup:
+ data, next_url = parse_actor_detail(soup, next_url)
+            if data:
+                print(data)
+        else:
+            print('failed to fetch a valid page.')
+ break
+
+if __name__ == "__main__":
+ test_actor_list()
+ test_actor()
+
\ No newline at end of file
diff --git a/thelordofporn/src/sqlite_utils.py b/thelordofporn/src/sqlite_utils.py
new file mode 100644
index 0000000..71c7bc9
--- /dev/null
+++ b/thelordofporn/src/sqlite_utils.py
@@ -0,0 +1,199 @@
+import sqlite3
+import json
+import config
+import logging
+from datetime import datetime
+
+# Connect to the SQLite database
+DB_PATH = f"{config.global_share_data_dir}/sqlite/shared.db"  # change to your database file
+conn = sqlite3.connect(DB_PATH, check_same_thread=False)
+cursor = conn.cursor()
+
+tbl_name_actors = 'thelordofporn_actress'
+tbl_name_alias = 'thelordofporn_alias'
+
+# Check the SQLite version: UPSERT (ON CONFLICT ... DO UPDATE) requires >= 3.24.0
+lower_sqlite_version = False
+sqlite_version = sqlite3.sqlite_version_info
+if sqlite_version < (3, 24, 0):
+ lower_sqlite_version = True
+
+# Get the table's column names and default values
+def get_table_columns_and_defaults(tbl_name):
+ try:
+ cursor.execute(f"PRAGMA table_info({tbl_name})")
+ columns = cursor.fetchall()
+ column_info = {}
+ for col in columns:
+ col_name = col[1]
+ default_value = col[4]
+ column_info[col_name] = default_value
+ return column_info
+ except sqlite3.Error as e:
+ logging.error(f"Error getting table columns: {e}")
+ return None
+
+# Keep only the keys that correspond to real table columns
+def check_and_process_data(data, tbl_name):
+ column_info = get_table_columns_and_defaults(tbl_name=tbl_name)
+ if column_info is None:
+ return None
+ processed_data = {}
+ for col, default in column_info.items():
+        if col == 'id':  # autoincrement primary key, never supplied by the caller
+            continue
+        if col == 'created_at' or col == 'updated_at':  # set via SQL date functions
+ continue
+ if col in data:
+ processed_data[col] = data[col]
+
+ return processed_data
+
+
+# Insert or update (UPSERT) a row keyed on uniq_key
+def insert_or_update_common(data, tbl_name, uniq_key='href'):
+ if lower_sqlite_version:
+ return insert_or_update_common_lower(data, tbl_name, uniq_key)
+
+ try:
+ processed_data = check_and_process_data(data, tbl_name)
+ if processed_data is None:
+ return None
+
+ columns = ', '.join(processed_data.keys())
+ values = list(processed_data.values())
+ placeholders = ', '.join(['?' for _ in values])
+ update_clause = ', '.join([f"{col}=EXCLUDED.{col}" for col in processed_data.keys() if col != uniq_key]) + ', updated_at=datetime(\'now\', \'localtime\')'
+
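+        # EXCLUDED.<col> refers to the conflicting (incoming) row's values, so
+        # on a uniq_key conflict every other column is overwritten in place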
+ sql = f'''
+ INSERT INTO {tbl_name} ({columns}, updated_at)
+ VALUES ({placeholders}, datetime('now', 'localtime'))
+ ON CONFLICT ({uniq_key}) DO UPDATE SET {update_clause}
+ '''
+ cursor.execute(sql, values)
+ conn.commit()
+
+        # fetch the id of the inserted or updated row
+ cursor.execute(f"SELECT id FROM {tbl_name} WHERE {uniq_key} = ?", (data[uniq_key],))
+ report_id = cursor.fetchone()[0]
+ return report_id
+ except sqlite3.Error as e:
+ logging.error(f"Error inserting or updating data: {e}")
+ return None
+
+# Insert-or-update fallback for SQLite < 3.24.0 (no UPSERT support)
+def insert_or_update_common_lower(data, tbl_name, uniq_key='href'):
+ try:
+ processed_data = check_and_process_data(data, tbl_name)
+ if processed_data is None:
+ return None
+
+ columns = ', '.join(processed_data.keys())
+ values = list(processed_data.values())
+ placeholders = ', '.join(['?' for _ in values])
+
+        # try a plain insert first
+ try:
+ sql = f'''
+ INSERT INTO {tbl_name} ({columns}, updated_at)
+ VALUES ({placeholders}, datetime('now', 'localtime'))
+ '''
+ cursor.execute(sql, values)
+ conn.commit()
+        except sqlite3.IntegrityError:  # unique-key conflict, update instead
+ update_clause = ', '.join([f"{col}=?" for col in processed_data.keys() if col != uniq_key]) + ', updated_at=datetime(\'now\', \'localtime\')'
+ update_values = [processed_data[col] for col in processed_data.keys() if col != uniq_key]
+ update_values.append(data[uniq_key])
+ sql = f"UPDATE {tbl_name} SET {update_clause} WHERE {uniq_key} = ?"
+ cursor.execute(sql, update_values)
+ conn.commit()
+
+        # fetch the id of the inserted or updated row
+ cursor.execute(f"SELECT id FROM {tbl_name} WHERE {uniq_key} = ?", (data[uniq_key],))
+ report_id = cursor.fetchone()[0]
+ return report_id
+ except sqlite3.Error as e:
+ logging.error(f"Error inserting or updating data: {e}")
+ return None
+
+# Insert a row into the actor index table (updates on conflict)
+def insert_actor_index(data):
+ try:
+ return insert_or_update_common(data, tbl_name_actors)
+ except sqlite3.Error as e:
+ logging.error(f"Error inserting or updating data: {e}")
+ return None
+
+# Update an actor's detail fields and alias table
+def update_actor_detail(data, is_full_data=1):
+ try:
+ data['is_full_data'] = is_full_data
+ row_id = insert_or_update_common(data, tbl_name_actors)
+
+        # write the alias table
+ for alias in data.get("alias") or []:
+ cursor.execute('''
+ INSERT OR IGNORE INTO thelordofporn_alias (actress_id, alias, updated_at)
+ VALUES (?, ?, datetime('now', 'localtime'))
+ ''', (row_id, alias))
+ conn.commit()
+
+ return row_id
+ except sqlite3.Error as e:
+ logging.error(f"Error inserting or updating data: {e}")
+ return None
+
+# Query actors with optional filters
+def query_actors(**filters):
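+    # illustrative call (mirrors fetch.py):
+    #   query_actors(limit=100, start_id=0, is_full_data=0, order_by='id asc')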
+ try:
+ sql = f"SELECT href, pornstar as name FROM {tbl_name_actors} WHERE 1=1"
+ params = []
+
+ conditions = {
+ "id": " AND id = ?",
+ "href": " AND href = ?",
+ "pornstar": " AND pornstar LIKE ?",
+ "is_full_data": " AND is_full_data = ?",
+ "start_id": " AND id > ?",
+ }
+
+ for key, condition in conditions.items():
+ if key in filters:
+ sql += condition
+ if key == "pornstar":
+ params.append(f"%{filters[key]}%")
+ else:
+ params.append(filters[key])
+
+ for key in ["is_full_data_in", "is_full_data_not_in"]:
+ if key in filters:
+ values = filters[key]
+ if values:
+ placeholders = ", ".join(["?"] * len(values))
+ operator = "IN" if key == "is_full_data_in" else "NOT IN"
+ sql += f" AND is_full_data {operator} ({placeholders})"
+ params.extend(values)
+
+ if "order_by" in filters:
+            # note: ORDER BY takes a raw column name; a placeholder would be treated as a string literal
+ sql += f" ORDER BY {filters['order_by']} "
+
+ if 'limit' in filters:
+ sql += " LIMIT ?"
+ params.append(filters["limit"])
+
+ cursor.execute(sql, params)
+        # return [row[0].lower() for row in cursor.fetchall()]  # lowercase variant
+ return [{'href': row[0], 'name': row[1]} for row in cursor.fetchall()]
+
+ except sqlite3.Error as e:
+ logging.error(f"查询 href 失败: {e}")
+ return None
+
+
+# Test code
+if __name__ == "__main__":
+
+ print(query_actors("name LIKE '%未久%'"))
+ #delete_actor_by_href('https://www.javdb.com/actors/MkAX')
+ print(query_actors())
diff --git a/thelordofporn/src/utils.py b/thelordofporn/src/utils.py
new file mode 100644
index 0000000..3f2334f
--- /dev/null
+++ b/thelordofporn/src/utils.py
@@ -0,0 +1,48 @@
+import re
+import os
+import json
+import time
+import csv
+from datetime import datetime
+import logging
+import config
+from urllib.parse import urlparse, urlunparse, parse_qs, urlencode
+
+
+# Parse birth date and birthplace
+def parse_birth_info(text):
+ match = re.match(r"(.+?) (\d{1,2}), (\d{4}) in (.+)", text)
+ if match:
+ return {
+ "birth_date": f"{match.group(1)} {match.group(2)}, {match.group(3)}",
+ "birth_year": match.group(3),
+ "birth_place": match.group(4),
+ }
+ return {"birth_date": text, "birth_year": "", "birth_place": ""}
+
+# Parse height
+def parse_height(text):
+ match = re.match(r"(\d+)\s*ft\s*(\d*)\s*in\s*\((\d+)\s*cm\)", text)
+ if match:
+ height_ft = f"{match.group(1)}'{match.group(2)}\""
+ return {"height_ft": height_ft.strip(), "height_cm": match.group(3)}
+ return {"height_ft": text, "height_cm": ""}
+
+# Parse weight
+def parse_weight(text):
+ match = re.match(r"(\d+)\s*lbs\s*\((\d+)\s*kg\)", text)
+ if match:
+ return {"weight_lbs": match.group(1), "weight_kg": match.group(2)}
+ return {"weight_lbs": text, "weight_kg": ""}
+
+def clean_alias(alias):
+    alias = re.sub(r'\(Age \d+\)', '', alias)  # strip "(Age XX)"
+ return [name.strip() for name in alias.split(',') if name.strip()]
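+
+# e.g. (illustrative input) clean_alias("Eva Elfie (Age 24), Eva") -> ['Eva Elfie', 'Eva']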
+
+
+def parse_numeric(value):
+ try:
+ return float(value)
+ except (ValueError, TypeError):
+        return 0  # default to 0 on unparseable input
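+
+# e.g. parse_numeric("8.42") -> 8.42; parse_numeric("1,234") -> 0 (commas are not handled)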