From eb3b27ecb0ce95c69d1d833c84f3a9b881e4ca2d Mon Sep 17 00:00:00 2001
From: sophon
Date: Sun, 27 Jul 2025 19:01:53 +0800
Subject: [PATCH] modify scripts: fix cursor/connection references, add
 debug-mode paging limits, and rewrite distrib/studio hrefs

---
 scrapy_proj/cron/cmd.txt                      |  4 +-
 .../db_wapper/spider_db_handler.py            | 57 +++++++-------
 .../scrapy_proj/spiders/base_spider.py        |  9 ---
 .../scrapy_proj/spiders/iafd_spider.py        | 75 ++++++++++++++-----
 .../scrapy_proj/spiders/parser/iafd_parser.py | 18 +++++
 5 files changed, 107 insertions(+), 56 deletions(-)

diff --git a/scrapy_proj/cron/cmd.txt b/scrapy_proj/cron/cmd.txt
index 21d07f5..880eb91 100644
--- a/scrapy_proj/cron/cmd.txt
+++ b/scrapy_proj/cron/cmd.txt
@@ -8,4 +8,6 @@ scrapy crawl pbox -a mod='update' -a begin='2025-07-16'
 scrapy crawl pbox -a debug=1 -a cmd='studio,movies'
 scrapy crawl pbox -a debug='1' -s STATS_PUSH_MSG=False -a cmd='movies' -s LOG_LEVEL=DEBUG -a mod='update' -a begin='2025-07-16'
 
-scrapy crawl iafd -a debug=1 -a cmd=performers -s STATS_EXPORT_INTERVAL=60 -s LOG_LEVEL=DEBUG
\ No newline at end of file
+scrapy crawl javbus -a cmd=actors -s HTTPCACHE_DIR=/home/ubuntu/sharedata/scrapy_cached/
+
+scrapy crawl iafd -a cmd='astro,ethnic,dist,stu' -s HTTPCACHE_DIR=/home/ubuntu/sharedata/scrapy_cached/
\ No newline at end of file
diff --git a/scrapy_proj/scrapy_proj/db_wapper/spider_db_handler.py b/scrapy_proj/scrapy_proj/db_wapper/spider_db_handler.py
index 50baa5b..4b54ca2 100644
--- a/scrapy_proj/scrapy_proj/db_wapper/spider_db_handler.py
+++ b/scrapy_proj/scrapy_proj/db_wapper/spider_db_handler.py
@@ -1067,7 +1067,7 @@ class IAFDDBHandler(SQLiteDBHandler):
 
     # """Insert a movie index entry, sourced from list-page data"""
     #def insert_movie_index(self, title, href, release_year=0, from_performer_list=None, from_dist_list=None, from_stu_list=None):
-    def insert_movie_index(self, title, href, release_year=0, from_performer_list=None, from_dist_list=None, from_stu_list=None):
+    def insert_movie_index(self, title, href, **kwargs):
         fields = [
             'from_performer_list', 'from_dist_list', 'from_stu_list', 'release_year'
         ]
@@ -1104,7 +1104,7 @@ class IAFDDBHandler(SQLiteDBHandler):
             performer_id = self.insert_or_update_common(data=data, tbl_name=self.tbl_name_performers, uniq_key='href', exists_do_nothing=False)
             if performer_id is None:
                 return None
-            logging.debug(f"insert one performer, id: {performer_id}, name: {data['person']}, href: {data['href']}")
+            logging.debug(f"insert one performer, id: {performer_id}, name: {data['name']}, href: {data['href']}")
 
             # insert the new aliases
             for alias in data.get("performer_aka", []):
@@ -1115,7 +1114,6 @@ class IAFDDBHandler(SQLiteDBHandler):
                     composite_pk = ['performer_id', 'alias'],
                     exists_do_nothing = True
                 )
-            conn.commit()
 
             # insert the movie list; a person may appear in both performer and director roles
             if movies_update:
@@ -1126,7 +1125,7 @@ class IAFDDBHandler(SQLiteDBHandler):
                     movie_id = self.get_id_by_key(tbl=self.tbl_name_movies, uniq_key='href', val=movie['href'])
                     # movie is not indexed yet, insert it first
                     if movie_id is None:
-                        movie_id = self.insert_movie_index(movie['title'], movie['href'], utils.to_number(movie['year']), from_performer_list=1)
+                        movie_id = self.insert_movie_index(movie['title'], movie['href'], release_year=int(movie['year']), from_performer_list=1)
                     if movie_id:
                         tmp_id = self.insert_performer_movie(performer_id, movie_id, role, movie['notes'])
                         if tmp_id :
@@ -1137,11 +1136,11 @@ class IAFDDBHandler(SQLiteDBHandler):
             return performer_id
 
         except sqlite3.Error as e:
-            conn.rollback()
+            self.conn.rollback()
            logging.error(f"database error: {e}")
             return None
         except Exception as e:
-            conn.rollback()
+            self.conn.rollback()
             logging.error(f"unexpected error: {e}")
             return None
@@ -1175,7 +1174,7 @@ class IAFDDBHandler(SQLiteDBHandler):
                 sql += " AND name LIKE ?"
                 params.append(f"%{filters['name']}%")
 
-            cursor.execute(sql, params)
+            self.cursor.execute(sql, params)
             #return [row[0].lower() for row in cursor.fetchall()] # links are stored lowercase
-            return [{'href': row[0], 'name': row[1]} for row in cursor.fetchall()]
+            return [{'href': row[0], 'name': row[1]} for row in self.cursor.fetchall()]
@@ -1203,7 +1202,7 @@ class IAFDDBHandler(SQLiteDBHandler):
                 sql += " AND name LIKE ?"
                 params.append(f"%{filters['name']}%")
 
-            cursor.execute(sql, params)
-            return [row[0].lower() for row in cursor.fetchall()] # links are stored lowercase
+            self.cursor.execute(sql, params)
+            return [row[0].lower() for row in self.cursor.fetchall()] # links are stored lowercase
 
         except sqlite3.Error as e:
@@ -1230,7 +1229,7 @@ class IAFDDBHandler(SQLiteDBHandler):
                 sql += " AND name LIKE ?"
                 params.append(f"%{filters['name']}%")
 
-            cursor.execute(sql, params)
-            return [row[0].lower() for row in cursor.fetchall()] # links are stored lowercase
+            self.cursor.execute(sql, params)
+            return [row[0].lower() for row in self.cursor.fetchall()] # links are stored lowercase
 
         except sqlite3.Error as e:
@@ -1306,7 +1305,7 @@ class IAFDDBHandler(SQLiteDBHandler):
             return movie_id
 
         except Exception as e:
-            conn.rollback()
+            self.conn.rollback()
             logging.error("Error inserting movie: %s", e)
             return None
@@ -1569,7 +1568,7 @@ class IAFDDBHandler(SQLiteDBHandler):
             if limit is not None:
                 sql += f" LIMIT {limit}"
 
-            cursor.execute(sql)
-            return [{'href': row[0], 'name': row[1]} for row in cursor.fetchall()]
+            self.cursor.execute(sql)
+            return [{'href': row[0], 'name': row[1]} for row in self.cursor.fetchall()]
 
         except sqlite3.Error as e:
@@ -1589,19 +1588,19 @@ class IAFDDBHandler(SQLiteDBHandler):
                 "CREATE INDEX idx_iafd_performers_id ON iafd_performers (id);")
             ]
             for index_name, create_index_sql in indexes:
-                cursor.execute("SELECT name FROM sqlite_master WHERE type='index' AND name=?", (index_name,))
-                if not cursor.fetchone():
-                    cursor.execute(create_index_sql)
+                self.cursor.execute("SELECT name FROM sqlite_master WHERE type='index' AND name=?", (index_name,))
+                if not self.cursor.fetchone():
+                    self.cursor.execute(create_index_sql)
                     logging.info(f"Index {index_name} created successfully.")
                 else:
                     logging.info(f"Index {index_name} already exists.")
 
             # check whether the stats table exists; drop a stale one before recreating it
             view_name = f"iafd_tmp_performers_stat_{taskid}"
-            cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name=?", (view_name,))
-            if cursor.fetchone():
-                cursor.execute("drop table ?", (view_name,))
-                conn.commit()
+            self.cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name=?", (view_name,))
+            if self.cursor.fetchone():
+                # identifiers cannot be bound as SQL parameters; interpolate the already-verified name
+                self.cursor.execute(f"DROP TABLE {view_name}")
+                self.conn.commit()
 
             create_view_sql = f"""
             CREATE table {view_name} AS
@@ -1646,11 +1645,11 @@ class IAFDDBHandler(SQLiteDBHandler):
                 GROUP BY id, href, name, movies_cnt;
             """
-            cursor.execute(create_view_sql)
+            self.cursor.execute(create_view_sql)
             logging.info(f"table {view_name} created successfully.")
 
             # commit the changes and close the connection
-            conn.commit()
+            self.conn.commit()
 
         except sqlite3.Error as e:
             logging.warning(f"An error occurred: {e}")
@@ -1659,7 +1658,7 @@ class IAFDDBHandler(SQLiteDBHandler):
     def reset_actor_movies(self, check_and_do = 0):
         try:
            # check whether the movies_cnt column already exists
-            cursor.execute(f"PRAGMA table_info(iafd_performers);")
-            columns = [row[1] for row in cursor.fetchall()]
+            self.cursor.execute("PRAGMA table_info(iafd_performers);")
+            columns = [row[1] for row in self.cursor.fetchall()]
 
             if 'movies_cnt' not in columns:
                 # add the movies_cnt column
                 add_field_sql = f"""
                 ALTER TABLE iafd_performers ADD COLUMN movies_cnt INTEGER DEFAULT 0 NOT NULL;
                 """
-                cursor.execute(add_field_sql)
+                self.cursor.execute(add_field_sql)
                 logging.info("movies_cnt column added")
             else:
                 logging.info("movies_cnt column already exists, skipping")
 
             # make sure the join table is indexed
-            cursor.execute(f"""
+            self.cursor.execute("""
                 CREATE INDEX IF NOT EXISTS idx_iafd_performers_movies_performer_id
                 ON iafd_performers_movies(performer_id);
             """)
 
             # create a temporary table holding the per-performer counts
-            cursor.execute(f"""
+            self.cursor.execute("""
                 CREATE TEMPORARY TABLE temp_actor_counts AS
                 SELECT performer_id, COUNT(movie_id) AS cnt
                 FROM iafd_performers_movies
                 GROUP BY performer_id;
             """)
 
             # index the temporary table
-            cursor.execute("CREATE INDEX idx_temp_performer_id ON temp_actor_counts(performer_id);")
+            self.cursor.execute("CREATE INDEX idx_temp_performer_id ON temp_actor_counts(performer_id);")
 
             # update the main table
-            cursor.execute(f"""
+            self.cursor.execute("""
                 UPDATE iafd_performers
                 SET movies_cnt = COALESCE((
                     SELECT cnt FROM temp_actor_counts
                     WHERE temp_actor_counts.performer_id = iafd_performers.id
                 ), 0);
             """)
-            updated_rows = cursor.rowcount
+            updated_rows = self.cursor.rowcount
 
             logging.info(f"updated movie counts for {updated_rows} performers")
 
             # clean up
-            cursor.execute("DROP TABLE IF EXISTS temp_actor_counts;")
-            conn.commit()
+            self.cursor.execute("DROP TABLE IF EXISTS temp_actor_counts;")
+            self.conn.commit()
             logging.info("task finished!")
 
         except sqlite3.Error as e:
-            conn.rollback()
+            self.conn.rollback()
             logging.error("Error updating actor movie_cnt: %s", e)
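
Note on the DROP TABLE hunk above: SQLite's parameter binding substitutes values only, never identifiers, so the old `cursor.execute("drop table ?", (view_name,))` raises an OperationalError at runtime instead of dropping anything. A minimal standalone sketch of the corrected pattern (the table name here is a hypothetical stand-in for the task-scoped `iafd_tmp_performers_stat_{taskid}` name used in the patch):

    import sqlite3

    conn = sqlite3.connect(":memory:")
    cursor = conn.cursor()
    view_name = "iafd_tmp_performers_stat_demo"  # hypothetical task-scoped name

    # values can be bound with "?" placeholders ...
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name=?", (view_name,))
    if cursor.fetchone():
        # ... identifiers cannot, so the name (already confirmed to exist by the
        # lookup above, i.e. not attacker-controlled) is interpolated directly
        cursor.execute(f"DROP TABLE {view_name}")
        conn.commit()
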
diff --git a/scrapy_proj/scrapy_proj/spiders/base_spider.py b/scrapy_proj/scrapy_proj/spiders/base_spider.py
index fb7be36..58af748 100644
--- a/scrapy_proj/scrapy_proj/spiders/base_spider.py
+++ b/scrapy_proj/scrapy_proj/spiders/base_spider.py
@@ -6,15 +6,6 @@ from twisted.internet import reactor, defer, asyncioreactor
 import time
 
 class BaseSpider(scrapy.Spider):
-    def __init__(self, *args, **kwargs):
-        self.requested_url = set()
-
-    # track the request URLs already issued during this run
-    def _can_request(self, href):
-        if href in self.requested_url:
-            return False
-        self.requested_url.add(href)
-        return True
 
     def start_requests(self):
         """Unified request generation, compatible with the different entry points"""
diff --git a/scrapy_proj/scrapy_proj/spiders/iafd_spider.py b/scrapy_proj/scrapy_proj/spiders/iafd_spider.py
index 047a6c5..61d3ae1 100644
--- a/scrapy_proj/scrapy_proj/spiders/iafd_spider.py
+++ b/scrapy_proj/scrapy_proj/spiders/iafd_spider.py
@@ -44,6 +44,7 @@ class IAFDSpider(BaseSpider):
         self.existed_movies = {}
         self.load_existed_actors()
         self.load_existed_movies()
+        self.requested_url = set()
 
     # entry point, triggered by the base class method
     def custom_start_requests(self):
@@ -112,16 +113,12 @@ class IAFDSpider(BaseSpider):
         for astro in self.astro_list:
             url = self.astr_base_url + astro
             yield scrapy.Request(url, callback=self.parse_astro_page, meta={'astro': astro})
-            if self.debug:
-                break
 
     def start_birth(self):
         for month in range(1, 13):
             for day in range(1, 32):
                 url = self.birth_base_url.format(month=month, day=day)
                 yield scrapy.Request(url, callback=self.parse_birth_page, meta={'month': month, 'day': day})
-                if self.debug:
-                    break
 
     async def start(self):
         # invoke the original start_requests method
@@ -167,13 +164,18 @@ class IAFDSpider(BaseSpider):
                     item['href'] = ethnic_url
                     yield item
 
-                    yield scrapy.Request(ethnic_url, callback=self.parse_ethnic_page, meta={'ethnic': text})
+                    yield scrapy.Request(ethnic_url, callback=self.parse_ethnic_page, meta={'ethnic': text, 'depth': 1})
         else:
             self.logger.warning(f"parse page error. url: {response.url}")
 
     # fetch the list page, then request the details
     def parse_ethnic_page(self, response):
         ethnic = response.meta['ethnic']
+        depth = response.meta.get('depth', 1)
+        if self.debug and depth >= 3:
+            self.logger.debug(f"debug mode, stop paging. ethnic: {ethnic}, url: {response.url}")
+            return
+
         data, next_url = common_parser(html=response.text, page='ethnic', ethnic=ethnic)
         if data:
             self.logger.debug(f"fetched data from {response.url}, data len: {len(data)}")
@@ -181,7 +183,7 @@ class IAFDSpider(BaseSpider):
                 yield from self._create_performer_request(href=item['href'], name=item['person'])
 
             if next_url:
-                yield scrapy.Request(next_url, callback=self.parse_ethnic_page, meta={'ethnic': text})
+                yield scrapy.Request(next_url, callback=self.parse_ethnic_page, meta={'ethnic': ethnic, 'depth': depth + 1})
             else:
                 self.logger.info(f"found all pages. ethnic: {ethnic}, url: {response.url}")
         else:
@@ -226,7 +228,7 @@ class IAFDSpider(BaseSpider):
         list_type = response.meta.get('list_type', '')
         data, next_url = common_parser(html=response.text, page=list_type)
         if data:
-            self.logger.debug(f"fetched data from {response.url}, data: {data}")
+            self.logger.debug(f"fetched data from {response.url}, data len: {len(data)}")
             for movie in data:
                 yield from self._create_movie_request(href=movie['href'], title=movie['title'])
         else:
@@ -234,7 +236,9 @@ class IAFDSpider(BaseSpider):
 
     # unified helper for issuing performer detail requests
     def _create_performer_request(self, href, name):
-        if href != '' and is_valid_url(href):
+        if href == '':
+            return
+        if is_valid_url(href):
             if self._can_request(href):
                 self.crawler.stats.inc_value(f"{self.name}/actor_all")
                 yield scrapy.Request(href,
                     callback=self.parse_person_detail_page,
                     meta={'name': name, 'item_type':'movie'}
                 )
         else:
-            self.logger.warning(f"wrong url. {url}, ignore...")
+            self.logger.warning(f"wrong url. {href}, ignore...")
 
     # unified helper for issuing movie detail requests
     def _create_movie_request(self, href, title):
-        if href != '' and is_valid_url(href):
+        if href == '':
+            return
+        if is_valid_url(href):
             if self.need_update_movie(href) and self._can_request(href):
                 self.crawler.stats.inc_value(f"{self.name}/movie_all")
                 yield scrapy.Request(href,
                     callback=self.parse_movie_detail_page,
-                    meta={'title': title, 'item_type':'movie'},
-                    cache=True
+                    meta={'title': title, 'item_type':'movie', 'cache': True}
                 )
         else:
-            self.logger.warning(f"wrong url. {url}, ignore...")
+            self.logger.warning(f"wrong url. {href}, ignore...")
 
     # parse and handle the performer detail page
     def parse_person_detail_page(self, response):
@@ -264,6 +269,9 @@ class IAFDSpider(BaseSpider):
             self.logger.debug(f"fetched data from {response.url}, data: {data}")
             self.crawler.stats.inc_value(f"{self.name}/actor_done")
             item = IafdPerformersItem()
+            item['name'] = response.meta.get('name', '')
+            item['href'] = response.url
+            item['is_full_data'] = 1
             for k, v in data.items():
                 if k in item.fields:
                     item[k] = v
@@ -274,9 +282,9 @@ class IAFDSpider(BaseSpider):
             for role, movies in data.get('credits', {}).items():
                 if movies:
                     for item in movies:
-                        yield from self._create_movie_request(href=movie['href'], title=movie['title'])
+                        yield from self._create_movie_request(href=item['href'], title=item['title'])
         else:
-            self.logger.warning(f"fetched data error. {response.url}")
+            yield from self._handle_invalid_response(response)
 
     # parse and handle the movie detail page
     def parse_movie_detail_page(self, response):
@@ -286,6 +294,7 @@ class IAFDSpider(BaseSpider):
             self.logger.debug(f"fetched data from {response.url}, data: {data}")
             self.crawler.stats.inc_value(f"{self.name}/movie_done")
             item = IafdMoviesItem()
+            item['is_full_data'] = 1
             for k, v in data.items():
                 if k in item.fields:
                     item[k] = v
@@ -307,24 +316,39 @@ class IAFDSpider(BaseSpider):
                     yield from self._create_performer_request(href=director['href'], name=director['name'])
 
         else:
-            self.logger.warning(f"fetched data error. {response.url}")
+            yield from self._handle_invalid_response(response)
 
     # unified detection and handling of invalid responses
     def _handle_invalid_response(self, response):
+        update_flag = False
         if response.status in [200]:
             if "invalid or outdated page" in response.text.lower():
                 self.logger.warning(f"invalid or outdated page. url: {response.url}, status_code: {response.status}")
                 # TODO: refresh performers or movies that returned 404
+                update_flag = True
             else:
                 self.logger.warning(f"unknown page. url: {response.url}, content: {response.text[:500]}")
         elif response.status in [404, 403]:
             self.logger.warning(f"got {response.status} page. url: {response.url}")
             # TODO: refresh performers or movies that returned 404
-
+            update_flag = True
         else:
             self.logger.warning(f"unknown page. url: {response.url}, status: {response.status}, content: {response.text[:500]}")
 
+        if update_flag:
+            if 'person.rme' in response.url:
+                item = IafdPerformersItem()
+                item['href'] = response.url
+                item['name'] = response.meta.get('name', '')
+                item['is_full_data'] = 404
+                yield item
+            elif 'title.rme' in response.url:
+                item = IafdMoviesItem()
+                item['href'] = response.url
+                item['title'] = response.meta.get('title', '')
+                item['is_full_data'] = 404
+                yield item
 
     def load_existed_actors(self):
         query_args = {}
@@ -366,3 +390,20 @@ class IAFDSpider(BaseSpider):
 
     def acc_movie_to_existed(self, href, is_full_data=1):
         self.existed_movies[href] = is_full_data
+
+    def _can_request(self, href):
+        if href in self.requested_url:
+            return False
+
+        if self.debug:  # in debug mode, cap how many requests of each page type are issued
+            keys = ['person.rme', 'title.rme']
+            for key in keys:
+                count = 0
+                for url in self.requested_url:
+                    if key.lower() in url.lower():
+                        count += 1
+                if count >= 2 and key in href.lower():
+                    return False
+
+        self.requested_url.add(href)
+        return True
\ No newline at end of file
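
Note on `_handle_invalid_response` above: once the method contains `yield item`, Python treats it as a generator function, so a bare `self._handle_invalid_response(response)` only builds a generator object and discards it; none of the body runs and no 404 item ever reaches the pipeline. The call sites therefore delegate with `yield from`. A minimal Scrapy-free sketch of the pitfall:

    def handler():
        print("handling invalid response")  # never runs if the generator is discarded
        yield "404-item"

    def parse_wrong(ok):
        if ok:
            yield "parsed item"
        else:
            handler()             # BUG: generator created and dropped silently

    def parse_right(ok):
        if ok:
            yield "parsed item"
        else:
            yield from handler()  # delegates, so the body actually executes

    assert list(parse_wrong(False)) == []            # nothing yielded, nothing printed
    assert list(parse_right(False)) == ["404-item"]
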
diff --git a/scrapy_proj/scrapy_proj/spiders/parser/iafd_parser.py b/scrapy_proj/scrapy_proj/spiders/parser/iafd_parser.py
index d8c93b3..935b345 100644
--- a/scrapy_proj/scrapy_proj/spiders/parser/iafd_parser.py
+++ b/scrapy_proj/scrapy_proj/spiders/parser/iafd_parser.py
@@ -489,6 +489,18 @@ def extract_year_from_date_string(date_str):
     except TypeError:
         return 0
 
+def dist_stu_href_rewrite(href):
+    # extract the numeric ID (works for both distrib and studio URLs)
+    import re
+    match = re.search(r"(distrib|studio)=(\d+)", href)
+    if not match:
+        return None  # not a target URL
+
+    key, id_number = match.groups()
+    new_url = f"https://www.iafd.com/{key}.rme/{key}={id_number}"
+    return new_url
+
+
 # parse the page HTML and extract the movie information
 def parse_page_movie(soup, href, title):
     # parse the basic movie info
@@ -518,6 +530,12 @@ def parse_page_movie(soup, href, title):
     else:
         return None
 
+    if 'DistributorHref' in movie_data and 'distrib' in movie_data['DistributorHref']:
+        movie_data['DistributorHref'] = dist_stu_href_rewrite(movie_data['DistributorHref'])
+
+    if 'StudioHref' in movie_data and 'studio' in movie_data['StudioHref']:
+        movie_data['StudioHref'] = dist_stu_href_rewrite(movie_data['StudioHref'])
+
     # parse the cast and crew information
     performers = []
     cast_divs = soup.find_all("div", class_="castbox")
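
A quick usage illustration of the new `dist_stu_href_rewrite` helper (the input URL shapes below are assumed examples of the `distrib=`/`studio=` pattern the regex targets, not verified IAFD links). Note that a non-matching href comes back as None, which `parse_page_movie` currently assigns into `movie_data` unchecked:

    print(dist_stu_href_rewrite("https://www.iafd.com/distrib.asp?distrib=123"))
    # -> https://www.iafd.com/distrib.rme/distrib=123
    print(dist_stu_href_rewrite("https://www.iafd.com/studio.asp?studio=4567"))
    # -> https://www.iafd.com/studio.rme/studio=4567
    print(dist_stu_href_rewrite("https://www.iafd.com/person.rme/perfid=xx"))
    # -> None: no distrib=<digits> or studio=<digits> segment to extract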