modify scripts

2025-07-27 19:01:53 +08:00
parent 45271a5b23
commit eb3b27ecb0
5 changed files with 107 additions and 56 deletions

View File

@@ -8,4 +8,6 @@ scrapy crawl pbox -a mod='update' -a begin='2025-07-16'
scrapy crawl pbox -a debug=1 -a cmd='studio,movies'
scrapy crawl pbox -a debug='1' -s STATS_PUSH_MSG=False -a cmd='movies' -s LOG_LEVEL=DEBUG -a mod='update' -a begin='2025-07-16'
scrapy crawl iafd -a debug=1 -a cmd=performers -s STATS_EXPORT_INTERVAL=60 -s LOG_LEVEL=DEBUG
scrapy crawl javbus -a cmd=actors -s HTTPCACHE_DIR=/home/ubuntu/sharedata/scrapy_cached/
scrapy crawl iafd -a cmd='astro,ethnic,dist,stu' -s HTTPCACHE_DIR=/home/ubuntu/sharedata/scrapy_cached/
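For reference, a minimal sketch of the programmatic equivalent of these one-liners, assuming it runs inside the Scrapy project so the spider name resolves; `-a` pairs arrive as spider kwargs and `-s` pairs become setting overrides:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

settings = get_project_settings()
settings.set("LOG_LEVEL", "DEBUG")  # same effect as -s LOG_LEVEL=DEBUG
process = CrawlerProcess(settings)
# -a key=value pairs arrive as keyword arguments in the spider's __init__
process.crawl("pbox", debug="1", cmd="movies", mod="update", begin="2025-07-16")
process.start()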

View File

@@ -1067,7 +1067,7 @@ class IAFDDBHandler(SQLiteDBHandler):
# """Insert a movie index entry, built from list data"""
#def insert_movie_index(self, title, href, release_year=0, from_performer_list=None, from_dist_list=None, from_stu_list=None):
def insert_movie_index(self, title, href, release_year=0, from_performer_list=None, from_dist_list=None, from_stu_list=None):
def insert_movie_index(self, title, href, **kwargs):
fields = [
'from_performer_list', 'from_dist_list', 'from_stu_list', 'release_year'
]
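A minimal sketch of the whitelist pattern the new `**kwargs` signature enables; the rest of the method body sits outside this hunk, so the dict handling below is an assumption:

def insert_movie_index_sketch(title, href, **kwargs):
    fields = ['from_performer_list', 'from_dist_list', 'from_stu_list', 'release_year']
    data = {'title': title, 'href': href}
    # keep only the recognized optional columns; unknown kwargs are ignored
    data.update({k: v for k, v in kwargs.items() if k in fields and v is not None})
    return data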
@@ -1104,7 +1104,7 @@ class IAFDDBHandler(SQLiteDBHandler):
performer_id = self.insert_or_update_common(data=data, tbl_name=self.tbl_name_performers, uniq_key='href', exists_do_nothing=False)
if performer_id is None:
return None
logging.debug(f"insert one performer, id: {performer_id}, name: {data['person']}, href: {data['href']}")
logging.debug(f"insert one performer, id: {performer_id}, name: {data['name']}, href: {data['href']}")
# insert new aliases
for alias in data.get("performer_aka", []):
@@ -1115,7 +1115,6 @@ class IAFDDBHandler(SQLiteDBHandler):
composite_pk = ['performer_id', 'alias'],
exists_do_nothing = True
)
conn.commit()
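`insert_or_update_common` is defined outside this diff; with `exists_do_nothing=True` and a composite key it presumably reduces to an idempotent insert along these lines (the alias table name and its UNIQUE(performer_id, alias) constraint are assumptions):

self.cursor.execute(
    """
    INSERT INTO iafd_performers_aka (performer_id, alias)
    VALUES (?, ?)
    ON CONFLICT(performer_id, alias) DO NOTHING
    """,
    (performer_id, alias),
)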
# insert the movie list; a performer can appear in both personal and director roles
if movies_update:
@@ -1126,7 +1125,7 @@ class IAFDDBHandler(SQLiteDBHandler):
movie_id = self.get_id_by_key(tbl=self.tbl_name_movies, uniq_key='href', val=movie['href'])
# movie does not exist yet, insert it first
if movie_id is None:
movie_id = self.insert_movie_index(movie['title'], movie['href'], utils.to_number(movie['year']), from_performer_list=1)
movie_id = self.insert_movie_index(movie['title'], movie['href'], release_year=int(movie['year']), from_performer_list=1)
if movie_id:
tmp_id = self.insert_performer_movie(performer_id, movie_id, role, movie['notes'])
if tmp_id :
@@ -1137,11 +1136,11 @@ class IAFDDBHandler(SQLiteDBHandler):
return performer_id
except sqlite3.Error as e:
conn.rollback()
self.conn.rollback()
logging.error(f"数据库错误: {e}")
return None
except Exception as e:
conn.rollback()
self.conn.rollback()
logging.error(f"未知错误: {e}")
return None
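All the cursor/connection fixes in this file converge on one transaction shape; as a sketch, with `safe_write` a hypothetical helper name and `self.conn`/`self.cursor` as used above:

import sqlite3
import logging

def safe_write(self, sql, params=()):
    # hypothetical helper: execute + commit on the instance connection,
    # roll back that same connection on error
    try:
        self.cursor.execute(sql, params)
        self.conn.commit()
        return self.cursor.lastrowid
    except sqlite3.Error as e:
        self.conn.rollback()  # the bare conn/cursor names above were the bug
        logging.error(f"Database error: {e}")
        return None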
@@ -1175,7 +1174,7 @@ class IAFDDBHandler(SQLiteDBHandler):
sql += " AND name LIKE ?"
params.append(f"%{filters['name']}%")
cursor.execute(sql, params)
self.cursor.execute(sql, params)
#return [row[0].lower() for row in cursor.fetchall()] # links use lowercase
return [{'href': row[0], 'name': row[1]} for row in self.cursor.fetchall()]
@@ -1203,7 +1202,7 @@ class IAFDDBHandler(SQLiteDBHandler):
sql += " AND name LIKE ?"
params.append(f"%{filters['name']}%")
cursor.execute(sql, params)
self.cursor.execute(sql, params)
return [row[0].lower() for row in self.cursor.fetchall()] # links are stored lowercase
except sqlite3.Error as e:
@@ -1230,7 +1229,7 @@ class IAFDDBHandler(SQLiteDBHandler):
sql += " AND name LIKE ?"
params.append(f"%{filters['name']}%")
cursor.execute(sql, params)
self.cursor.execute(sql, params)
return [row[0].lower() for row in self.cursor.fetchall()] # links are stored lowercase
except sqlite3.Error as e:
@@ -1306,7 +1305,7 @@ class IAFDDBHandler(SQLiteDBHandler):
return movie_id
except Exception as e:
conn.rollback()
self.conn.rollback()
logging.error("Error inserting movie: %s", e)
return None
@@ -1569,7 +1568,7 @@ class IAFDDBHandler(SQLiteDBHandler):
if limit is not None:
sql += f" LIMIT {limit}"
cursor.execute(sql)
self.cursor.execute(sql)
return [{'href': row[0], 'name': row[1]} for row in self.cursor.fetchall()]
except sqlite3.Error as e:
@@ -1589,19 +1588,19 @@ class IAFDDBHandler(SQLiteDBHandler):
"CREATE INDEX idx_iafd_performers_id ON iafd_performers (id);")
]
for index_name, create_index_sql in indexes:
cursor.execute("SELECT name FROM sqlite_master WHERE type='index' AND name=?", (index_name,))
if not cursor.fetchone():
cursor.execute(create_index_sql)
self.cursor.execute("SELECT name FROM sqlite_master WHERE type='index' AND name=?", (index_name,))
if not self.cursor.fetchone():
self.cursor.execute(create_index_sql)
logging.info(f"Index {index_name} created successfully.")
else:
logging.info(f"Index {index_name} already exists.")
# check whether the stats table already exists; drop it, then (re)create
view_name = f"iafd_tmp_performers_stat_{taskid}"
cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name=?", (view_name,))
if cursor.fetchone():
cursor.execute("drop table ?", (view_name,))
conn.commit()
self.cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name=?", (view_name,))
if self.cursor.fetchone():
self.cursor.execute("drop table ?", (view_name,))
self.conn.commit()
create_view_sql = f"""
CREATE table {view_name} AS
@@ -1646,11 +1645,11 @@ class IAFDDBHandler(SQLiteDBHandler):
GROUP BY
id, href, name, movies_cnt;
"""
cursor.execute(create_view_sql)
self.cursor.execute(create_view_sql)
logging.info(f"table {view_name} created successfully.")
# commit the changes and close the connection
conn.commit()
self.conn.commit()
except sqlite3.Error as e:
logging.warning(f"An error occurred: {e}")
@@ -1659,7 +1658,7 @@ class IAFDDBHandler(SQLiteDBHandler):
def reset_actor_movies(self, check_and_do = 0):
try:
# check whether the movies_cnt column already exists
cursor.execute(f"PRAGMA table_info(iafd_performers);")
self.cursor.execute(f"PRAGMA table_info(iafd_performers);")
columns = [row[1] for row in self.cursor.fetchall()]
if 'movies_cnt' not in columns:
@@ -1667,19 +1666,19 @@ class IAFDDBHandler(SQLiteDBHandler):
add_field_sql = f"""
ALTER TABLE iafd_performers ADD COLUMN movies_cnt INTEGER DEFAULT 0 NOT NULL;
"""
cursor.execute(add_field_sql)
self.cursor.execute(add_field_sql)
logging.info("成功添加movies_cnt字段")
else:
logging.info("movies_cnt字段已存在跳过添加")
# 确保关联表有索引
cursor.execute(f"""
self.cursor.execute(f"""
CREATE INDEX IF NOT EXISTS idx_iafd_performers_movies_performer_id
ON iafd_performers_movies(performer_id);
""")
# create a temporary table to hold the per-performer counts
cursor.execute(f"""
self.cursor.execute(f"""
CREATE TEMPORARY TABLE temp_actor_counts AS
SELECT performer_id, COUNT(movie_id) AS cnt
FROM iafd_performers_movies
@@ -1687,10 +1686,10 @@ class IAFDDBHandler(SQLiteDBHandler):
""")
# add an index on the temporary table
cursor.execute("CREATE INDEX idx_temp_performer_id ON temp_actor_counts(performer_id);")
self.cursor.execute("CREATE INDEX idx_temp_performer_id ON temp_actor_counts(performer_id);")
# update the main table
cursor.execute(f"""
self.cursor.execute(f"""
UPDATE iafd_performers
SET movies_cnt = COALESCE((
SELECT cnt FROM temp_actor_counts
@@ -1702,12 +1701,12 @@ class IAFDDBHandler(SQLiteDBHandler):
logging.info(f"成功更新{updated_rows}个演员的影片数量")
# 清理资源
cursor.execute("DROP TABLE IF EXISTS temp_actor_counts;")
conn.commit()
self.cursor.execute("DROP TABLE IF EXISTS temp_actor_counts;")
self.conn.commit()
logging.info("任务执行完成!")
except sqlite3.Error as e:
conn.rollback()
self.conn.rollback()
logging.error("Error updating actor movie_cnt: %s", e)

View File

@@ -6,15 +6,6 @@ from twisted.internet import reactor, defer, asyncioreactor
import time
class BaseSpider(scrapy.Spider):
def __init__(self, *args, **kwargs):
self.requested_url = set()
# track the request URLs already issued during this run
def _can_request(self, href):
if href in self.requested_url:
return False
self.requested_url.add(href)
return True
def start_requests(self):
"""统一处理请求生成,兼容不同入口点"""

View File

@@ -44,6 +44,7 @@ class IAFDSpider(BaseSpider):
self.existed_movies = {}
self.load_existed_actors()
self.load_existed_movies()
self.requested_url = set()
# entry point, triggered from the base class
def custom_start_requests(self):
@@ -112,16 +113,12 @@
for astro in self.astro_list:
url = self.astr_base_url + astro
yield scrapy.Request(url, callback=self.parse_astro_page, meta={'astro': astro})
if self.debug:
break
def start_birth(self):
for month in range(1, 13):
for day in range(1, 32):
url = self.birth_base_url.format(month=month, day=day)
yield scrapy.Request(url, callback=self.parse_birth_page, meta={'month': month, 'day': day})
if self.debug:
break
async def start(self):
# call the original start_requests method
@@ -167,13 +164,18 @@
item['href'] = ethnic_url
yield item
yield scrapy.Request(ethnic_url, callback=self.parse_ethnic_page, meta={'ethnic': text})
yield scrapy.Request(ethnic_url, callback=self.parse_ethnic_page, meta={'ethnic': text, 'depth':1})
else:
self.logger.warning(f"parse page error. url: {response.url}")
# got the list; fetch the details
def parse_ethnic_page(self, response):
ethnic = response.meta['ethnic']
depth = response.meta.get('depth', 1)
if self.debug and depth>=3:
self.logger.debug(f"debug mode, stop next page. ethnic:{ethnic}, url: {response.url}")
return
data, next_url = common_parser(html=response.text, page='ethnic', ethnic=ethnic)
if data:
self.logger.debug(f"fetched data from {response.url}, data len: {len(data)}")
@@ -181,7 +183,7 @@
yield from self._create_performer_request(href=item['href'], name=item['person'])
if next_url:
yield scrapy.Request(next_url, callback=self.parse_ethnic_page, meta={'ethnic': text})
yield scrapy.Request(next_url, callback=self.parse_ethnic_page, meta={'ethnic': ethnic, 'depth':depth+1})
else:
self.logger.info(f"found all pages. ethnic: {ethnic}, url: {response.url}")
else:
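Scrapy also ships a built-in variant of this cap: DepthMiddleware fills response.meta['depth'] on its own for requests spawned from a response, and a DEPTH_LIMIT setting stops deeper requests crawl-wide. A sketch of that alternative (hypothetical subclass, debug runs only):

class IAFDDebugSpider(IAFDSpider):
    # hypothetical subclass: let DepthMiddleware enforce the page cap
    # crawl-wide instead of hand-counting depth in each parse_* callback
    name = 'iafd_debug'
    custom_settings = {'DEPTH_LIMIT': 2}

The hand-rolled meta counter has the advantage of applying only to the ethnic listing pages, while DEPTH_LIMIT is global.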
@@ -226,7 +228,7 @@
list_type = response.meta.get('list_type', '')
data, next_url = common_parser(html=response.text, page=list_type)
if data:
self.logger.debug(f"fetched data from {response.url}, data: {data}")
self.logger.debug(f"fetched data from {response.url}, data len: {len(data)}")
for movie in data:
yield from self._create_movie_request(href=movie['href'], title=movie['title'])
else:
@@ -234,7 +236,9 @@
# unified handling for issuing performer detail requests
def _create_performer_request(self, href, name):
if href != '' and is_valid_url(href):
if href == '':
return
if is_valid_url(href):
if self._can_request(href):
self.crawler.stats.inc_value(f"{self.name}/actor_all")
yield scrapy.Request(href,
@@ -242,20 +246,21 @@
meta={'name': name, 'item_type':'movie'}
)
else:
self.logger.warning(f"wrong url. {url}, ignore...")
self.logger.warning(f"wrong url. {href}, ignore...")
# unified handling for issuing movie detail requests
def _create_movie_request(self, href, title):
if href != '' and is_valid_url(href):
if href == '':
return
if is_valid_url(href):
if self.need_update_movie(href) and self._can_request(href):
self.crawler.stats.inc_value(f"{self.name}/movie_all")
yield scrapy.Request(href,
callback=self.parse_movie_detail_page,
meta={'title': title, 'item_type':'movie'},
cache=True
meta={'title': title, 'item_type':'movie', 'cache':True}
)
else:
self.logger.warning(f"wrong url. {url}, ignore...")
self.logger.warning(f"wrong url. {href}, ignore...")
# parse and handle performer detail pages
def parse_person_detail_page(self, response):
@@ -264,6 +269,9 @@
self.logger.debug(f"fetched data from {response.url}, data: {data}")
self.crawler.stats.inc_value(f"{self.name}/actor_done")
item = IafdPerformersItem()
item['name'] = response.meta.get('name', '')
item['href'] = response.url
item['is_full_data'] = 1
for k, v in data.items():
if k in item.fields:
item[k] = v
@@ -274,9 +282,9 @@
for role, movies in data.get('credits', {}).items():
if movies:
for item in movies:
yield from self._create_movie_request(href=movie['href'], title=movie['title'])
yield from self._create_movie_request(href=item['href'], title=item['title'])
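# note: this loop variable reuses the name `item`, shadowing the
# IafdPerformersItem yielded above; renaming it (e.g. to `movie`) would
# prevent a recurrence of the exact mix-up this hunk fixes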
else:
self.logger.warning(f"fetched data error. {response.url}")
yield from self._handle_invalid_response(response)  # the handler yields items, so it must be delegated to
# parse and handle movie detail pages
def parse_movie_detail_page(self, response):
@@ -286,6 +294,7 @@
self.logger.debug(f"fetched data from {response.url}, data: {data}")
self.crawler.stats.inc_value(f"{self.name}/movie_done")
item = IafdMoviesItem()
item['is_full_data'] = 1
for k, v in data.items():
if k in item.fields:
item[k] = v
@@ -307,24 +316,39 @@
yield from self._create_performer_request(href=director['href'], name=director['name'])
else:
self.logger.warning(f"fetched data error. {response.url}")
yield from self._handle_invalid_response(response)  # delegated for the same reason as above
# unified detection and handling of invalid responses
def _handle_invalid_response(self, response):
update_flag = False
if response.status in [200]:
if "invalid or outdated page" in response.text.lower():
self.logger.warning(f"invalid or outdated page. url: {response.url}, status_code: {response.status}")
# TODO: update performers or movies that return 404
update_flag = True
else:
self.logger.warning(f"unkown page. url:{response.url}, content: {response.text[:500]}")
elif response.status in [404, 403]:
self.logger.warning(f"get 404 page. url: {response.url}")
# TODO: 更新404的演员或者影片
update_flag = True
else:
self.logger.warning(f"unkown page. url:{response.url}, status: {response.status}, content: {response.text[:500]}")
if update_flag:
if 'person.rme' in response.url:
item = IafdPerformersItem()
item['href'] = response.url
item['name'] = response.meta.get('name', '')
item['is_full_data'] = 404
yield item
elif 'title.rme' in response.url:
item = IafdMoviesItem()
item['href'] = response.url
item['title'] = response.meta.get('title', '')
item['is_full_data'] = 404
yield item
def load_existed_actors(self):
query_args = {}
@@ -366,3 +390,20 @@ class IAFDSpider(BaseSpider):
def acc_movie_to_existed(self, href, is_full_data=1):
self.existed_movies[href] = is_full_data
def _can_request(self, href):
if href in self.requested_url:
return False
if self.debug: # under debug, cap how many requests of each URL type are issued
keys = ['person.rme', 'title.rme']
for key in keys:
count = 0
for url in self.requested_url:
if key.lower() in url.lower():
count+=1
if count >=2 and key in href.lower():
return False
self.requested_url.add(href)
return True
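The debug cap above rescans requested_url on every call; an O(1) sketch under the assumption that a `self._debug_counts = Counter()` is added in `__init__`:

from collections import Counter

DEBUG_CAP_KEYS = ('person.rme', 'title.rme')

def _can_request(self, href):
    if href in self.requested_url:
        return False
    low = href.lower()
    if self.debug:
        # allow at most two requests per capped URL type in debug runs
        for key in DEBUG_CAP_KEYS:
            if key in low and self._debug_counts[key] >= 2:
                return False
    self.requested_url.add(href)
    for key in DEBUG_CAP_KEYS:
        if key in low:
            self._debug_counts[key] += 1
    return True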

View File

@@ -489,6 +489,18 @@ def extract_year_from_date_string(date_str):
except TypeError:
return 0
def dist_stu_href_rewrite(href):
# extract the ID; works for both distrib and studio links
import re
match = re.search(r"(distrib|studio)=(\d+)", href)
if not match:
return None # not a target URL, return None
key, id_number = match.groups()
new_url = f"https://www.iafd.com/{key}.rme/{key}={id_number}"
return new_url
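A usage example with a hypothetical source link:

# dist_stu_href_rewrite("https://www.iafd.com/distrib.rme/distrib=1234/sort=year")
#   -> "https://www.iafd.com/distrib.rme/distrib=1234"
# dist_stu_href_rewrite("https://example.com/other")  -> None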
# parse the page HTML and extract the movie info
def parse_page_movie(soup, href, title):
# parse the basic movie info
@@ -518,6 +530,12 @@ def parse_page_movie(soup, href, title):
else:
return None
if 'DistributorHref' in movie_data and 'distrib' in movie_data['DistributorHref']:
movie_data['DistributorHref'] = dist_stu_href_rewrite(movie_data['DistributorHref'])
if 'StudioHref' in movie_data and 'studio' in movie_data['StudioHref']:
movie_data['StudioHref'] = dist_stu_href_rewrite(movie_data['StudioHref'])
# parse the cast and crew info
performers = []
cast_divs = soup.find_all("div", class_="castbox")