From f1e5abd6b33ec2528f6ee7e53613a09e6fda5f9b Mon Sep 17 00:00:00 2001 From: oscar Date: Mon, 3 Mar 2025 19:01:41 +0800 Subject: [PATCH] modify some scripts. --- gitignore | 1 + .../0b332f5e-c0b8-4536-a79c-5abd4da9c68a.json | 20 + .../2f582dcf-192e-4adf-9d60-447df8f16b9c.json | 56 ++ .../9af4e9f4-68ce-47ec-a7d7-fde92862af57.json | 70 +++ .../ca753243-8e3a-49ac-88aa-357055187e8c.json | 85 +++ scripts/iafd/src/config.py | 26 + scripts/iafd/src/fetch.py | 320 +++++++++++ scripts/iafd/src/iafd_scraper.py | 513 ++++++++++++++++++ scripts/iafd/src/sqlite_utils.py | 459 ++++++++++++++++ scripts/iafd/src/utils.py | 92 ++++ 10 files changed, 1642 insertions(+) create mode 100644 scripts/iafd/result/movies/0/0b332f5e-c0b8-4536-a79c-5abd4da9c68a.json create mode 100644 scripts/iafd/result/movies/2/2f582dcf-192e-4adf-9d60-447df8f16b9c.json create mode 100644 scripts/iafd/result/movies/9/9af4e9f4-68ce-47ec-a7d7-fde92862af57.json create mode 100644 scripts/iafd/result/movies/c/ca753243-8e3a-49ac-88aa-357055187e8c.json create mode 100644 scripts/iafd/src/config.py create mode 100644 scripts/iafd/src/fetch.py create mode 100644 scripts/iafd/src/iafd_scraper.py create mode 100644 scripts/iafd/src/sqlite_utils.py create mode 100644 scripts/iafd/src/utils.py diff --git a/gitignore b/gitignore index a57550b..d482c08 100644 --- a/gitignore +++ b/gitignore @@ -12,6 +12,7 @@ scripts/iafd/data/tmp/ scripts/iafd/result/tmp/ scripts/iafd/result/bak/ scripts/iafd/result/performers/ +scripts/iafd/result/movies/ scripts/iafd/log/ scripts/thelordofporn/log/ scripts/vixen_group/log/ diff --git a/scripts/iafd/result/movies/0/0b332f5e-c0b8-4536-a79c-5abd4da9c68a.json b/scripts/iafd/result/movies/0/0b332f5e-c0b8-4536-a79c-5abd4da9c68a.json new file mode 100644 index 0000000..e04eb58 --- /dev/null +++ b/scripts/iafd/result/movies/0/0b332f5e-c0b8-4536-a79c-5abd4da9c68a.json @@ -0,0 +1,20 @@ +{ + "href": "https://www.iafd.com/title.rme/id=0b332f5e-c0b8-4536-a79c-5abd4da9c68a", + "title": "Barebackin' 
Men", + "Minutes": "No Data", + "Distributor": "1 Distribution", + "Studio": "1 Distribution", + "ReleaseDate": "No Data", + "AddedtoIAFDDate": "Jan 1, 2006", + "All-Girl": "No", + "All-Male": "Yes", + "Compilation": "No", + "Webscene": "", + "Director": "No Data", + "DirectorHref": "", + "DistributorHref": "https://www.iafd.com/distrib.rme/distrib=10/1%2ddistribution.htm", + "StudioHref": "https://www.iafd.com/studio.rme/studio=10/1%2ddistribution.htm", + "Performers": [], + "SceneBreakdowns": [], + "AppearsIn": [] +} \ No newline at end of file diff --git a/scripts/iafd/result/movies/2/2f582dcf-192e-4adf-9d60-447df8f16b9c.json b/scripts/iafd/result/movies/2/2f582dcf-192e-4adf-9d60-447df8f16b9c.json new file mode 100644 index 0000000..51f224d --- /dev/null +++ b/scripts/iafd/result/movies/2/2f582dcf-192e-4adf-9d60-447df8f16b9c.json @@ -0,0 +1,56 @@ +{ + "href": "https://www.iafd.com/title.rme/id=2f582dcf-192e-4adf-9d60-447df8f16b9c", + "title": "Slim Goodies POV 2", + "Minutes": "84", + "Distributor": "Exotic Vixen Films", + "Studio": "Exotic Vixen Films", + "ReleaseDate": "No Data", + "AddedtoIAFDDate": "Jan 17, 2024", + "All-Girl": "No", + "All-Male": "No", + "Compilation": "No", + "Webscene": "", + "Director": "Just Mike Starks", + "DirectorHref": "https://www.iafd.com/person.rme/id=b29f1981-51f7-471a-bda1-c0845fc9b71f", + "DistributorHref": "https://www.iafd.com/distrib.rme/distrib=12229/exotic%2dvixen%2dfilms.htm", + "StudioHref": "https://www.iafd.com/studio.rme/studio=12229/exotic%2dvixen%2dfilms.htm", + "Performers": [ + { + "name": "Amica Mea", + "href": "https://www.iafd.com/person.rme/id=933ab6e6-ba98-49a7-a9e2-c1a4523ab89c", + "tags": [ + "Amica Mea" + ] + }, + { + "name": "Baby Breezy", + "href": "https://www.iafd.com/person.rme/id=b0d3673c-20b4-41eb-bb15-622b2f8e5ed3", + "tags": [ + "Baby Breezy" + ] + }, + { + "name": "Blu Mere", + "href": "https://www.iafd.com/person.rme/id=8d41dd04-f188-44c3-ac40-dc64711cf905", + "tags": [ + "Blu Mere" + ] + }, + 
{ + "name": "Just Mike Starks", + "href": "https://www.iafd.com/person.rme/id=b29f1981-51f7-471a-bda1-c0845fc9b71f", + "tags": [ + "Just Mike Starks" + ] + }, + { + "name": "Mocha Menage", + "href": "https://www.iafd.com/person.rme/id=63b084e9-a144-41da-acf9-a913b68416cd", + "tags": [ + "Mocha Menage" + ] + } + ], + "SceneBreakdowns": [], + "AppearsIn": [] +} \ No newline at end of file diff --git a/scripts/iafd/result/movies/9/9af4e9f4-68ce-47ec-a7d7-fde92862af57.json b/scripts/iafd/result/movies/9/9af4e9f4-68ce-47ec-a7d7-fde92862af57.json new file mode 100644 index 0000000..94cc88d --- /dev/null +++ b/scripts/iafd/result/movies/9/9af4e9f4-68ce-47ec-a7d7-fde92862af57.json @@ -0,0 +1,70 @@ +{ + "href": "https://www.iafd.com/title.rme/id=9af4e9f4-68ce-47ec-a7d7-fde92862af57", + "title": "Atlanta U: College Freaks", + "Minutes": "No Data", + "Distributor": "Exotic Vixen Films", + "Studio": "Exotic Vixen Films", + "ReleaseDate": "No Data", + "AddedtoIAFDDate": "Sep 19, 2020", + "All-Girl": "No", + "All-Male": "No", + "Compilation": "No", + "Webscene": "", + "Director": "Just Mike Starks", + "DirectorHref": "https://www.iafd.com/person.rme/id=b29f1981-51f7-471a-bda1-c0845fc9b71f", + "DistributorHref": "https://www.iafd.com/distrib.rme/distrib=12229/exotic%2dvixen%2dfilms.htm", + "StudioHref": "https://www.iafd.com/studio.rme/studio=12229/exotic%2dvixen%2dfilms.htm", + "Performers": [ + { + "name": "Aaliyah Ali", + "href": "https://www.iafd.com/person.rme/id=7904b6a0-965c-4137-92e2-2ca0cdc4ff38", + "tags": [ + "Aaliyah Ali" + ] + }, + { + "name": "Bones Montana", + "href": "https://www.iafd.com/person.rme/id=8c18f9ad-045d-475f-bee8-cab6bed1fad4", + "tags": [ + "Bones Montana" + ] + }, + { + "name": "Cameron Cox", + "href": "https://www.iafd.com/person.rme/id=ce96cf3d-e85e-4d46-ad5e-f05881c88a26", + "tags": [ + "Cameron Cox" + ] + }, + { + "name": "Crystal Cooper", + "href": "https://www.iafd.com/person.rme/id=fc02ac73-2892-4578-9bf4-eed87afc4980", + "tags": [ + "Crystal 
Cooper" + ] + }, + { + "name": "Jazmine Adore", + "href": "https://www.iafd.com/person.rme/id=1809f7bb-5a2a-4a4a-87c1-caed73a88bc4", + "tags": [ + "Jazmine Adore" + ] + }, + { + "name": "Just Mike Starks", + "href": "https://www.iafd.com/person.rme/id=b29f1981-51f7-471a-bda1-c0845fc9b71f", + "tags": [ + "Just Mike Starks" + ] + }, + { + "name": "Lala Ivey", + "href": "https://www.iafd.com/person.rme/id=7ae838ee-764f-4725-b422-b41ffac1e67b", + "tags": [ + "Lala Ivey" + ] + } + ], + "SceneBreakdowns": [], + "AppearsIn": [] +} \ No newline at end of file diff --git a/scripts/iafd/result/movies/c/ca753243-8e3a-49ac-88aa-357055187e8c.json b/scripts/iafd/result/movies/c/ca753243-8e3a-49ac-88aa-357055187e8c.json new file mode 100644 index 0000000..08af9d1 --- /dev/null +++ b/scripts/iafd/result/movies/c/ca753243-8e3a-49ac-88aa-357055187e8c.json @@ -0,0 +1,85 @@ +{ + "href": "https://www.iafd.com/title.rme/id=ca753243-8e3a-49ac-88aa-357055187e8c", + "title": "Slim Goodies POV", + "Minutes": "61", + "Distributor": "Exotic Vixen Films", + "Studio": "Exotic Vixen Films", + "ReleaseDate": "No Data", + "AddedtoIAFDDate": "Sep 19, 2020", + "All-Girl": "No", + "All-Male": "No", + "Compilation": "No", + "Webscene": "", + "Director": "Just Mike Starks", + "DirectorHref": "https://www.iafd.com/person.rme/id=b29f1981-51f7-471a-bda1-c0845fc9b71f", + "DistributorHref": "https://www.iafd.com/distrib.rme/distrib=12229/exotic%2dvixen%2dfilms.htm", + "StudioHref": "https://www.iafd.com/studio.rme/studio=12229/exotic%2dvixen%2dfilms.htm", + "Performers": [ + { + "name": "Gina Ferrero", + "href": "https://www.iafd.com/person.rme/id=54d62a5b-2ce5-40fa-b050-14325a62f4fd", + "tags": [ + "Gina Ferrero" + ] + }, + { + "name": "Imani Reign", + "href": "https://www.iafd.com/person.rme/id=8566216f-bcbc-4764-ba50-73405cf79dce", + "tags": [ + "Imani Reign" + ] + }, + { + "name": "Jazmine Adore", + "href": "https://www.iafd.com/person.rme/id=1809f7bb-5a2a-4a4a-87c1-caed73a88bc4", + "tags": [ + "Jazmine 
global_share_data_dir = '/root/sharedata'
global_host_data_dir = '/root/hostdir/scripts_data'


def setup_logging(log_filename=None):
    """Configure root logging to write to a log file and to the console.

    Args:
        log_filename: Path of the log file. When omitted, the file is named
            ``<calling script>_<yyyymmdd>.log`` and placed in the ``log``
            directory next to this module's directory (``<module dir>/../log``),
            independent of the current working directory.

    The directory that holds the log file is created when it is missing;
    otherwise ``logging.FileHandler`` would raise ``FileNotFoundError``.
    """
    if log_filename is None:
        # Name the log after the script that called setup_logging().
        caller_frame = inspect.stack()[1]
        caller_filename = os.path.splitext(os.path.basename(caller_frame.filename))[0]

        # Current date formatted as yyyymmdd, placed just before the extension.
        current_date = datetime.now().strftime('%Y%m%d')

        # Anchor the default location to this module rather than to the
        # working directory, so the script can be started from anywhere.
        default_dir = os.path.normpath(
            os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'log')
        )
        log_filename = os.path.join(default_dir, f'{caller_filename}_{current_date}.log')

    log_dir = os.path.dirname(log_filename)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] (%(funcName)s) - %(message)s',
        handlers=[
            # Explicit encoding: scraped names and titles may be non-ASCII.
            logging.FileHandler(log_filename, encoding='utf-8'),
            logging.StreamHandler(),
        ],
    )
# When True every crawl stops after its first page/item (debugging aid).
debug = True


def _new_performers(list_data, known_hrefs):
    """Return ``{'person', 'href'}`` dicts for rows whose href is not known yet."""
    return [
        {'person': row['person'], 'href': row['href']}
        for row in list_data
        if row['href'] not in known_hrefs
    ]


def _dedupe_by_href(items):
    """Drop items with a repeated ``href`` while keeping first-seen order."""
    return list({item["href"]: item for item in items}.values())


def fetch_performers_by_astro(existed_performer_hrefs):
    """Collect performers from the per-zodiac-sign listing pages (no paging).

    Args:
        existed_performer_hrefs: Iterable of performer hrefs already stored.

    Returns:
        List of ``{'person', 'href'}`` dicts for performers not stored yet.
    """
    known_hrefs = set(existed_performer_hrefs)
    performers = []
    validator = partial(scraper.generic_validator, tag="div", identifier="astro", attr_type="id")

    for astro in scraper.astro_list:
        url = scraper.astr_base_url + astro
        logging.info(f"Fetching data for {astro}, url {url} ...")

        soup = scraper.fetch_page(url, validator)
        if soup:
            list_data, _next_url = scraper.parse_page_astro(soup, astro)
            if list_data:
                performers.extend(_new_performers(list_data, known_hrefs))
            else:
                logging.warning(f'parse astro page error. {url} ...')
        else:
            logging.warning(f'fetch astro page error. {url} ...')

        # Debugging: stop after the first sign.
        if debug:
            break
    return performers


def fetch_performers_by_birth(existed_performer_hrefs):
    """Collect performers from the per-birthday listing pages (no paging).

    Only real calendar dates are requested; a leap year is used for the day
    count so that Feb 29 is still covered.

    Args:
        existed_performer_hrefs: Iterable of performer hrefs already stored.

    Returns:
        List of ``{'person', 'href'}`` dicts for performers not stored yet.
    """
    import calendar

    known_hrefs = set(existed_performer_hrefs)
    performers = []
    validator = partial(scraper.generic_validator, tag="div",
                        identifier="col-sm-12 col-lg-9", attr_type="class")

    for month in range(1, 13):
        days_in_month = calendar.monthrange(2000, month)[1]
        for day in range(1, days_in_month + 1):
            url = scraper.birth_base_url.format(month=month, day=day)
            logging.info(f"Fetching data for birth, url {url}")
            soup = scraper.fetch_page(url, validator)
            if soup:
                list_data, _next_url = scraper.parse_page_birth(soup, month, day)
                if list_data:
                    performers.extend(_new_performers(list_data, known_hrefs))
                else:
                    logging.warning(f'parse birth page error. {url} ...')
            else:
                logging.warning(f'fetch birth page error. {url} ...')

            # Debugging: stop after the first day.
            if debug:
                return performers

    return performers


def format_ethnic(ethnic):
    """Return *ethnic* with spaces replaced by ``+`` for use in the lookup URL."""
    return ethnic.replace(' ', '+')


def fetch_performers_by_ethnic(existed_performer_hrefs):
    """Collect performers from the per-ethnicity listing pages, following paging.

    Each page is fetched from the URL reported by the previous page. Paging for
    one ethnicity stops when no next URL is reported, when a page cannot be
    fetched, or when a URL repeats (guards against an endless loop).

    Args:
        existed_performer_hrefs: Iterable of performer hrefs already stored.

    Returns:
        List of ``{'person', 'href'}`` dicts for performers not stored yet.
    """
    from urllib.parse import urljoin

    known_hrefs = set(existed_performer_hrefs)
    performers = []
    validator = partial(scraper.generic_validator, tag="div",
                        identifier="row headshotrow", attr_type="class")

    for ethnic in scraper.ethnic_list:
        page_url = scraper.ethnic_url + format_ethnic(ethnic)
        visited = set()

        while page_url and page_url not in visited:
            visited.add(page_url)
            logging.info(f"Fetching data for {ethnic}, url {page_url} ...")
            soup = scraper.fetch_page(page_url, validator, parser="lxml",
                                      preprocessor=scraper.preprocess_html)
            next_url = None
            if soup:
                list_data, next_url = scraper.parse_page_ethnic(soup, ethnic)
                if list_data:
                    performers.extend(_new_performers(list_data, known_hrefs))
                else:
                    logging.warning(f'parse ethnic page error. {page_url} ...')
            else:
                # fetch_page() already retried; give up on this ethnicity.
                logging.warning(f'fetch ethnic page error. {page_url} ...')

            # Debugging: stop after the first page.
            if debug:
                return performers

            # NOTE(review): parse_page_ethnic is not visible here; urljoin copes
            # with either an absolute or a relative next-page link.
            page_url = urljoin(page_url, next_url) if next_url else None
    return performers


def _fetch_new_entries(list_url, base_url, select_name, existed_hrefs, label):
    """Read a ``<select name=...>`` listing page and return entries not stored yet.

    Args:
        list_url: URL of the page holding the select element.
        base_url: Prefix joined with each row's href to build the full URL.
        select_name: ``name`` attribute of the select element to validate/parse.
        existed_hrefs: Iterable of full hrefs already stored.
        label: Word used in log messages (e.g. ``'distributors'``).

    Returns:
        List of ``{'name', 'href'}`` dicts; empty when fetching or parsing fails.
    """
    known_hrefs = set(existed_hrefs)
    entries = []

    logging.info(f"Fetching data for {label} list, url {list_url} ...")
    soup = scraper.fetch_page(
        list_url,
        partial(scraper.generic_validator, tag="select", identifier=select_name, attr_type="name"),
    )
    if not soup:
        logging.warning(f'fetch {label} list error. {list_url} ...')
        return entries

    list_data, _next_url = scraper.parse_page_dist_stu_list(soup, select_name)
    if not list_data:
        logging.warning(f'parse {label} list error. {list_url} ...')
        return entries

    for row in list_data:
        href = base_url + row['href']
        if href in known_hrefs:
            continue
        entries.append({'name': row['name'], 'href': href})
    return entries


def fetch_distributors_list(existed_distributors_href):
    """Return distributors (``{'name', 'href'}``) that are not stored yet."""
    return _fetch_new_entries(scraper.distributors_list_url, scraper.distributors_base_url,
                              "Distrib", existed_distributors_href, 'distributors')


def fetch_studios_list(existed_studios_href):
    """Return studios (``{'name', 'href'}``) that are not stored yet."""
    return _fetch_new_entries(scraper.studios_list_url, scraper.studios_base_url,
                              "Studio", existed_studios_href, 'studios')


def _update_performers(existed_performer_hrefs):
    """Fetch every new performer page, store it in the DB and dump it to JSON."""
    new_performers = []
    # The astro and birthday listings are intentionally disabled for now.
    # new_performers.extend(fetch_performers_by_astro(existed_performer_hrefs))
    # new_performers.extend(fetch_performers_by_birth(existed_performer_hrefs))
    new_performers.extend(fetch_performers_by_ethnic(existed_performer_hrefs))

    new_performers = _dedupe_by_href(new_performers)
    logging.info(f'get new performers count: {len(new_performers)} ')

    validator = partial(scraper.generic_validator, tag="div", identifier="headshot", attr_type="id")
    for performer in new_performers:
        url = performer['href']
        person = performer['person']
        soup = scraper.fetch_page(url, validator)
        if soup:
            data, performer_credits = scraper.parse_page_performer(soup)
            if data:
                performer_id = utils.insert_or_update_performer({
                    'href': url,
                    'person': person,
                    **data
                })
                if performer_id:
                    logging.info(f'insert one person, id: {performer_id}, person: {person}, url: {url}')
                else:
                    logging.warning(f'insert person: {person} {url} failed.')

                # Also keep a local JSON copy.
                func.write_person_json(person, url, {
                    'href': url,
                    'person': person,
                    **data,
                    'credits': performer_credits if performer_credits else {}
                })
            else:
                logging.warning(f'parse_page_performer error. person: {person}, url: {url}')
        else:
            logging.warning(f'fetch_page error. person: {person}, url: {url}')

        # Debugging: stop after the first performer.
        if debug:
            break


def _sync_name_table(label, query_hrefs, fetch_new, insert_row):
    """Insert newly listed distributors/studios; return False if the DB read failed."""
    existed_hrefs = query_hrefs()
    if existed_hrefs is None:
        logging.warning(f'get existed {label}s from db error.')
        return False

    for row in fetch_new(existed_hrefs):
        row_id = insert_row(row)
        # Double quotes inside the f-string keep this valid before Python 3.12.
        if row_id:
            logging.info(f'insert one {label} record, id: {row_id}, name: {row["name"]}, href: {row["href"]}')
        else:
            logging.warning(f'insert into {label} failed. name: {row["name"]}, href: {row["href"]}')
    return True


def _collect_new_movies(list_urls, table_id, seen_hrefs, new_movies):
    """Append movies not in *seen_hrefs* from each distributor/studio page.

    *seen_hrefs* (a set) and *new_movies* (a list) are updated in place so that
    a movie listed under both a distributor and a studio is queued only once.
    """
    validator = partial(scraper.generic_validator, tag="table", identifier=table_id, attr_type="id")
    for url in list_urls:
        soup = scraper.fetch_page(url, validator)
        if soup:
            list_data, _next_url = scraper.parse_page_dist_stu(soup, table_id)
            if list_data:
                for movie in list_data:
                    href = movie['href']
                    if href in seen_hrefs:
                        continue
                    seen_hrefs.add(href)
                    new_movies.append({'title': movie['title'], 'href': href})
            else:
                logging.warning(f'parse_page_movie error. url: {url}')
        else:
            logging.warning(f'fetch_page error. url: {url}')

        # Debugging: stop after the first page.
        if debug:
            break


def _update_movies(new_movies):
    """Fetch every new movie page, store it in the DB and dump it to JSON."""
    validator = partial(scraper.generic_validator, tag="div",
                        identifier="col-xs-12 col-sm-3", attr_type="class")
    for movie in new_movies:
        url = movie['href']
        title = movie['title']
        soup = scraper.fetch_page(url, validator)
        if soup:
            movie_data = scraper.parse_page_movie(soup, url, title)
            if movie_data:
                movie_id = utils.insert_or_update_movie(movie_data)
                if movie_id:
                    logging.info(f'insert one movie, id: {movie_id}, title: {title} url: {url}')
                else:
                    logging.warning(f'insert movie {url} failed.')

                # Also keep a local JSON copy.
                func.write_movie_json(url, movie_data)
            else:
                logging.warning(f'parse_page_movie error. url: {url}')
        else:
            logging.warning(f'fetch_page error. url: {url}')

        # Debugging: stop after the first movie.
        if debug:
            break


def check_update():
    """Run one incremental update: performers, distributors, studios, movies.

    Each stage reads the hrefs already stored, scrapes the listing pages for
    unknown entries and writes them to the DB (performers and movies are also
    written as JSON files). The run stops early, returning ``None``, when a
    DB read reports failure by returning ``None``.
    """
    existed_performer_hrefs = utils.query_performer_hrefs()
    # `is None` matches the sibling checks below, so an empty table (first
    # run) is no longer mistaken for a DB error.
    if existed_performer_hrefs is None:
        logging.warning('get existed performers from db error.')
        return None
    _update_performers(existed_performer_hrefs)

    if not _sync_name_table('distributor', utils.query_distributor_hrefs,
                            fetch_distributors_list, utils.insert_or_update_distributor):
        return None
    if not _sync_name_table('studio', utils.query_studio_hrefs,
                            fetch_studios_list, utils.insert_or_update_studio):
        return None

    existed_movies = utils.query_movie_hrefs()
    if existed_movies is None:
        logging.warning('load movies from db error')
        return None
    seen_hrefs = set(existed_movies)
    new_movies = []

    # Movies listed on the selected distributor pages.
    distributor_urls = utils.query_distributor_hrefs(name='vixen')
    if distributor_urls is None:
        logging.warning('get existed distributors from db error.')
        return None
    _collect_new_movies(distributor_urls, 'distable', seen_hrefs, new_movies)
    logging.info(f'all new moives found for distributors, now total new {len(new_movies)}')

    # Movies listed on the selected studio pages.
    studio_urls = utils.query_studio_hrefs(name='vixen')
    if studio_urls is None:
        logging.warning('get existed studios from db error.')
        return None
    _collect_new_movies(studio_urls, 'studio', seen_hrefs, new_movies)
    logging.info(f'all new moives found for studios, now total new {len(new_movies)}')

    new_movies = _dedupe_by_href(new_movies)
    logging.info(f'get merged new movies, count: {len(new_movies)} ')
    _update_movies(new_movies)

    logging.info('all process completed!')
    return None


if __name__ == "__main__":
    check_update()
def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor=None,
               timeout=30, retry_delay=1.0):
    """Download *url* with the shared CloudScraper session and return parsed HTML.

    Args:
        url: Page to request.
        validator: Callable taking the parsed soup; a truthy result accepts the page.
        max_retries: Maximum number of attempts.
        parser: BeautifulSoup parser name.
        preprocessor: Optional callable applied to the raw HTML text before parsing.
        timeout: Seconds to wait for the server before an attempt is abandoned,
            so a stalled connection cannot hang the crawl.
        retry_delay: Base pause in seconds between attempts; it grows linearly
            with the attempt number. Use 0 to retry immediately.

    Returns:
        The BeautifulSoup object of the first response that passes *validator*,
        or ``None`` when every attempt failed.
    """
    for attempt in range(max_retries):
        try:
            response = scraper.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()  # turn HTTP error statuses into exceptions

            # Optionally repair the raw HTML before it is parsed.
            html_text = preprocessor(response.text) if preprocessor else response.text

            soup = BeautifulSoup(html_text, parser)
            if validator(soup):
                return soup

            logging.warning(f"Validation failed on attempt {attempt + 1} for {url}")
        except cloudscraper.exceptions.CloudflareChallengeError as e:
            logging.error(f"Cloudflare Challenge Error on {url}: {e}, Retring...")
        except cloudscraper.exceptions.CloudflareCode1020 as e:
            logging.error(f"Access Denied (Error 1020) on {url}: {e}, Retring...")
        except Exception as e:
            # Deliberately broad: any failure just counts as a failed attempt.
            logging.error(f"Unexpected error on {url}: {e}, Retring...")

        # Back off before the next attempt; no pause after the final one.
        if retry_delay and attempt + 1 < max_retries:
            time.sleep(retry_delay * (attempt + 1))

    logging.error(f'Fetching failed after max retries. {url}')
    return None  # every attempt failed
', '').replace('