modify scripts
This commit is contained in:
@ -8,4 +8,6 @@ scrapy crawl pbox -a mod='update' -a begin='2025-07-16'
|
|||||||
scrapy crawl pbox -a debug=1 -a cmd='studio,movies'
|
scrapy crawl pbox -a debug=1 -a cmd='studio,movies'
|
||||||
scrapy crawl pbox -a debug='1' -s STATS_PUSH_MSG=False -a cmd='movies' -s LOG_LEVEL=DEBUG -a mod='update' -a begin='2025-07-16'
|
scrapy crawl pbox -a debug='1' -s STATS_PUSH_MSG=False -a cmd='movies' -s LOG_LEVEL=DEBUG -a mod='update' -a begin='2025-07-16'
|
||||||
|
|
||||||
scrapy crawl iafd -a debug=1 -a cmd=performers -s STATS_EXPORT_INTERVAL=60 -s LOG_LEVEL=DEBUG
|
scrapy crawl javbus -a cmd=actors -s HTTPCACHE_DIR=/home/ubuntu/sharedata/scrapy_cached/
|
||||||
|
|
||||||
|
scrapy crawl iafd -a cmd='astro,ethnic,dist,stu' -s HTTPCACHE_DIR=/home/ubuntu/sharedata/scrapy_cached/
|
||||||
@ -1067,7 +1067,7 @@ class IAFDDBHandler(SQLiteDBHandler):
|
|||||||
|
|
||||||
# """插入电影索引,来自于列表数据"""
|
# """插入电影索引,来自于列表数据"""
|
||||||
#def insert_movie_index(self, title, href, release_year=0, from_performer_list=None, from_dist_list=None, from_stu_list=None):
|
#def insert_movie_index(self, title, href, release_year=0, from_performer_list=None, from_dist_list=None, from_stu_list=None):
|
||||||
def insert_movie_index(self, title, href, release_year=0, from_performer_list=None, from_dist_list=None, from_stu_list=None):
|
def insert_movie_index(self, title, href, **kwargs):
|
||||||
fields = [
|
fields = [
|
||||||
'from_performer_list', 'from_dist_list', 'from_stu_list', 'release_year'
|
'from_performer_list', 'from_dist_list', 'from_stu_list', 'release_year'
|
||||||
]
|
]
|
||||||
@ -1104,7 +1104,7 @@ class IAFDDBHandler(SQLiteDBHandler):
|
|||||||
performer_id = self.insert_or_update_common(data=data, tbl_name=self.tbl_name_performers, uniq_key='href', exists_do_nothing=False)
|
performer_id = self.insert_or_update_common(data=data, tbl_name=self.tbl_name_performers, uniq_key='href', exists_do_nothing=False)
|
||||||
if performer_id is None:
|
if performer_id is None:
|
||||||
return None
|
return None
|
||||||
logging.debug(f"insert one performer, id: {performer_id}, name: {data['person']}, href: {data['href']}")
|
logging.debug(f"insert one performer, id: {performer_id}, name: {data['name']}, href: {data['href']}")
|
||||||
|
|
||||||
# 插入新的 alias
|
# 插入新的 alias
|
||||||
for alias in data.get("performer_aka", []):
|
for alias in data.get("performer_aka", []):
|
||||||
@ -1115,7 +1115,6 @@ class IAFDDBHandler(SQLiteDBHandler):
|
|||||||
composite_pk = ['performer_id', 'alias'],
|
composite_pk = ['performer_id', 'alias'],
|
||||||
exists_do_nothing = True
|
exists_do_nothing = True
|
||||||
)
|
)
|
||||||
conn.commit()
|
|
||||||
|
|
||||||
# 插入影片列表,可能有 personal 和 director 两个身份
|
# 插入影片列表,可能有 personal 和 director 两个身份
|
||||||
if movies_update:
|
if movies_update:
|
||||||
@ -1126,7 +1125,7 @@ class IAFDDBHandler(SQLiteDBHandler):
|
|||||||
movie_id = self.get_id_by_key(tbl=self.tbl_name_movies, uniq_key='href', val=movie['href'])
|
movie_id = self.get_id_by_key(tbl=self.tbl_name_movies, uniq_key='href', val=movie['href'])
|
||||||
# 影片不存在,先插入
|
# 影片不存在,先插入
|
||||||
if movie_id is None:
|
if movie_id is None:
|
||||||
movie_id = self.insert_movie_index(movie['title'], movie['href'], utils.to_number(movie['year']), from_performer_list=1)
|
movie_id = self.insert_movie_index(movie['title'], movie['href'], release_year=int(movie['year']), from_performer_list=1)
|
||||||
if movie_id:
|
if movie_id:
|
||||||
tmp_id = self.insert_performer_movie(performer_id, movie_id, role, movie['notes'])
|
tmp_id = self.insert_performer_movie(performer_id, movie_id, role, movie['notes'])
|
||||||
if tmp_id :
|
if tmp_id :
|
||||||
@ -1137,11 +1136,11 @@ class IAFDDBHandler(SQLiteDBHandler):
|
|||||||
return performer_id
|
return performer_id
|
||||||
|
|
||||||
except sqlite3.Error as e:
|
except sqlite3.Error as e:
|
||||||
conn.rollback()
|
self.conn.rollback()
|
||||||
logging.error(f"数据库错误: {e}")
|
logging.error(f"数据库错误: {e}")
|
||||||
return None
|
return None
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
conn.rollback()
|
self.conn.rollback()
|
||||||
logging.error(f"未知错误: {e}")
|
logging.error(f"未知错误: {e}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
@ -1175,7 +1174,7 @@ class IAFDDBHandler(SQLiteDBHandler):
|
|||||||
sql += " AND name LIKE ?"
|
sql += " AND name LIKE ?"
|
||||||
params.append(f"%{filters['name']}%")
|
params.append(f"%{filters['name']}%")
|
||||||
|
|
||||||
cursor.execute(sql, params)
|
self.cursor.execute(sql, params)
|
||||||
#return [row[0].lower() for row in cursor.fetchall()] # 链接使用小写
|
#return [row[0].lower() for row in cursor.fetchall()] # 链接使用小写
|
||||||
return [{'href': row[0], 'name': row[1]} for row in cursor.fetchall()]
|
return [{'href': row[0], 'name': row[1]} for row in cursor.fetchall()]
|
||||||
|
|
||||||
@ -1203,7 +1202,7 @@ class IAFDDBHandler(SQLiteDBHandler):
|
|||||||
sql += " AND name LIKE ?"
|
sql += " AND name LIKE ?"
|
||||||
params.append(f"%{filters['name']}%")
|
params.append(f"%{filters['name']}%")
|
||||||
|
|
||||||
cursor.execute(sql, params)
|
self.cursor.execute(sql, params)
|
||||||
return [row[0].lower() for row in cursor.fetchall()] # 链接使用小写
|
return [row[0].lower() for row in cursor.fetchall()] # 链接使用小写
|
||||||
|
|
||||||
except sqlite3.Error as e:
|
except sqlite3.Error as e:
|
||||||
@ -1230,7 +1229,7 @@ class IAFDDBHandler(SQLiteDBHandler):
|
|||||||
sql += " AND name LIKE ?"
|
sql += " AND name LIKE ?"
|
||||||
params.append(f"%{filters['name']}%")
|
params.append(f"%{filters['name']}%")
|
||||||
|
|
||||||
cursor.execute(sql, params)
|
self.cursor.execute(sql, params)
|
||||||
return [row[0].lower() for row in cursor.fetchall()] # 链接使用小写
|
return [row[0].lower() for row in cursor.fetchall()] # 链接使用小写
|
||||||
|
|
||||||
except sqlite3.Error as e:
|
except sqlite3.Error as e:
|
||||||
@ -1306,7 +1305,7 @@ class IAFDDBHandler(SQLiteDBHandler):
|
|||||||
return movie_id
|
return movie_id
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
conn.rollback()
|
self.conn.rollback()
|
||||||
logging.error("Error inserting movie: %s", e)
|
logging.error("Error inserting movie: %s", e)
|
||||||
return None
|
return None
|
||||||
|
|
||||||
@ -1569,7 +1568,7 @@ class IAFDDBHandler(SQLiteDBHandler):
|
|||||||
if limit is not None:
|
if limit is not None:
|
||||||
sql += f" LIMIT {limit}"
|
sql += f" LIMIT {limit}"
|
||||||
|
|
||||||
cursor.execute(sql)
|
self.cursor.execute(sql)
|
||||||
return [{'href': row[0], 'name': row[1]} for row in cursor.fetchall()]
|
return [{'href': row[0], 'name': row[1]} for row in cursor.fetchall()]
|
||||||
|
|
||||||
except sqlite3.Error as e:
|
except sqlite3.Error as e:
|
||||||
@ -1589,19 +1588,19 @@ class IAFDDBHandler(SQLiteDBHandler):
|
|||||||
"CREATE INDEX idx_iafd_performers_id ON iafd_performers (id);")
|
"CREATE INDEX idx_iafd_performers_id ON iafd_performers (id);")
|
||||||
]
|
]
|
||||||
for index_name, create_index_sql in indexes:
|
for index_name, create_index_sql in indexes:
|
||||||
cursor.execute("SELECT name FROM sqlite_master WHERE type='index' AND name=?", (index_name,))
|
self.cursor.execute("SELECT name FROM sqlite_master WHERE type='index' AND name=?", (index_name,))
|
||||||
if not cursor.fetchone():
|
if not self.cursor.fetchone():
|
||||||
cursor.execute(create_index_sql)
|
self.cursor.execute(create_index_sql)
|
||||||
logging.info(f"Index {index_name} created successfully.")
|
logging.info(f"Index {index_name} created successfully.")
|
||||||
else:
|
else:
|
||||||
logging.info(f"Index {index_name} already exists.")
|
logging.info(f"Index {index_name} already exists.")
|
||||||
|
|
||||||
# 检查视图是否存在,如果不存在则创建
|
# 检查视图是否存在,如果不存在则创建
|
||||||
view_name = f"iafd_tmp_performers_stat_{taskid}"
|
view_name = f"iafd_tmp_performers_stat_{taskid}"
|
||||||
cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name=?", (view_name,))
|
self.cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name=?", (view_name,))
|
||||||
if cursor.fetchone():
|
if self.cursor.fetchone():
|
||||||
cursor.execute("drop table ?", (view_name,))
|
self.cursor.execute("drop table ?", (view_name,))
|
||||||
conn.commit()
|
self.conn.commit()
|
||||||
|
|
||||||
create_view_sql = f"""
|
create_view_sql = f"""
|
||||||
CREATE table {view_name} AS
|
CREATE table {view_name} AS
|
||||||
@ -1646,11 +1645,11 @@ class IAFDDBHandler(SQLiteDBHandler):
|
|||||||
GROUP BY
|
GROUP BY
|
||||||
id, href, name, movies_cnt;
|
id, href, name, movies_cnt;
|
||||||
"""
|
"""
|
||||||
cursor.execute(create_view_sql)
|
self.cursor.execute(create_view_sql)
|
||||||
logging.info(f"table {view_name} created successfully.")
|
logging.info(f"table {view_name} created successfully.")
|
||||||
|
|
||||||
# 提交更改并关闭连接
|
# 提交更改并关闭连接
|
||||||
conn.commit()
|
self.conn.commit()
|
||||||
except sqlite3.Error as e:
|
except sqlite3.Error as e:
|
||||||
logging.warning(f"An error occurred: {e}")
|
logging.warning(f"An error occurred: {e}")
|
||||||
|
|
||||||
@ -1659,7 +1658,7 @@ class IAFDDBHandler(SQLiteDBHandler):
|
|||||||
def reset_actor_movies(self, check_and_do = 0):
|
def reset_actor_movies(self, check_and_do = 0):
|
||||||
try:
|
try:
|
||||||
# 检查表中是否已存在movies_cnt列
|
# 检查表中是否已存在movies_cnt列
|
||||||
cursor.execute(f"PRAGMA table_info(iafd_performers);")
|
self.cursor.execute(f"PRAGMA table_info(iafd_performers);")
|
||||||
columns = [row[1] for row in cursor.fetchall()]
|
columns = [row[1] for row in cursor.fetchall()]
|
||||||
|
|
||||||
if 'movies_cnt' not in columns:
|
if 'movies_cnt' not in columns:
|
||||||
@ -1667,19 +1666,19 @@ class IAFDDBHandler(SQLiteDBHandler):
|
|||||||
add_field_sql = f"""
|
add_field_sql = f"""
|
||||||
ALTER TABLE iafd_performers ADD COLUMN movies_cnt INTEGER DEFAULT 0 NOT NULL;
|
ALTER TABLE iafd_performers ADD COLUMN movies_cnt INTEGER DEFAULT 0 NOT NULL;
|
||||||
"""
|
"""
|
||||||
cursor.execute(add_field_sql)
|
self.cursor.execute(add_field_sql)
|
||||||
logging.info("成功添加movies_cnt字段")
|
logging.info("成功添加movies_cnt字段")
|
||||||
else:
|
else:
|
||||||
logging.info("movies_cnt字段已存在,跳过添加")
|
logging.info("movies_cnt字段已存在,跳过添加")
|
||||||
|
|
||||||
# 确保关联表有索引
|
# 确保关联表有索引
|
||||||
cursor.execute(f"""
|
self.cursor.execute(f"""
|
||||||
CREATE INDEX IF NOT EXISTS idx_iafd_performers_movies_performer_id
|
CREATE INDEX IF NOT EXISTS idx_iafd_performers_movies_performer_id
|
||||||
ON iafd_performers_movies(performer_id);
|
ON iafd_performers_movies(performer_id);
|
||||||
""")
|
""")
|
||||||
|
|
||||||
# 创建临时表存储统计结果
|
# 创建临时表存储统计结果
|
||||||
cursor.execute(f"""
|
self.cursor.execute(f"""
|
||||||
CREATE TEMPORARY TABLE temp_actor_counts AS
|
CREATE TEMPORARY TABLE temp_actor_counts AS
|
||||||
SELECT performer_id, COUNT(movie_id) AS cnt
|
SELECT performer_id, COUNT(movie_id) AS cnt
|
||||||
FROM iafd_performers_movies
|
FROM iafd_performers_movies
|
||||||
@ -1687,10 +1686,10 @@ class IAFDDBHandler(SQLiteDBHandler):
|
|||||||
""")
|
""")
|
||||||
|
|
||||||
# 为临时表添加索引
|
# 为临时表添加索引
|
||||||
cursor.execute("CREATE INDEX idx_temp_performer_id ON temp_actor_counts(performer_id);")
|
self.cursor.execute("CREATE INDEX idx_temp_performer_id ON temp_actor_counts(performer_id);")
|
||||||
|
|
||||||
# 更新主表
|
# 更新主表
|
||||||
cursor.execute(f"""
|
self.cursor.execute(f"""
|
||||||
UPDATE iafd_performers
|
UPDATE iafd_performers
|
||||||
SET movies_cnt = COALESCE((
|
SET movies_cnt = COALESCE((
|
||||||
SELECT cnt FROM temp_actor_counts
|
SELECT cnt FROM temp_actor_counts
|
||||||
@ -1702,12 +1701,12 @@ class IAFDDBHandler(SQLiteDBHandler):
|
|||||||
logging.info(f"成功更新{updated_rows}个演员的影片数量")
|
logging.info(f"成功更新{updated_rows}个演员的影片数量")
|
||||||
|
|
||||||
# 清理资源
|
# 清理资源
|
||||||
cursor.execute("DROP TABLE IF EXISTS temp_actor_counts;")
|
self.cursor.execute("DROP TABLE IF EXISTS temp_actor_counts;")
|
||||||
conn.commit()
|
self.conn.commit()
|
||||||
|
|
||||||
logging.info("任务执行完成!")
|
logging.info("任务执行完成!")
|
||||||
|
|
||||||
except sqlite3.Error as e:
|
except sqlite3.Error as e:
|
||||||
conn.rollback()
|
self.conn.rollback()
|
||||||
logging.error("Error updating actor movie_cnt: %s", e)
|
logging.error("Error updating actor movie_cnt: %s", e)
|
||||||
|
|
||||||
|
|||||||
@ -6,15 +6,6 @@ from twisted.internet import reactor, defer, asyncioreactor
|
|||||||
import time
|
import time
|
||||||
|
|
||||||
class BaseSpider(scrapy.Spider):
|
class BaseSpider(scrapy.Spider):
|
||||||
def __init__(self, *args, **kwargs):
|
|
||||||
self.requested_url = set()
|
|
||||||
|
|
||||||
# 记录本次任务已经发起的请求链接
|
|
||||||
def _can_request(self, href):
|
|
||||||
if href in self.requested_url:
|
|
||||||
return False
|
|
||||||
self.requested_url.add(href)
|
|
||||||
return True
|
|
||||||
|
|
||||||
def start_requests(self):
|
def start_requests(self):
|
||||||
"""统一处理请求生成,兼容不同入口点"""
|
"""统一处理请求生成,兼容不同入口点"""
|
||||||
|
|||||||
@ -44,6 +44,7 @@ class IAFDSpider(BaseSpider):
|
|||||||
self.existed_movies = {}
|
self.existed_movies = {}
|
||||||
self.load_existed_actors()
|
self.load_existed_actors()
|
||||||
self.load_existed_movies()
|
self.load_existed_movies()
|
||||||
|
self.requested_url = set()
|
||||||
|
|
||||||
# 入口函数,由基类的方法触发
|
# 入口函数,由基类的方法触发
|
||||||
def custom_start_requests(self):
|
def custom_start_requests(self):
|
||||||
@ -112,16 +113,12 @@ class IAFDSpider(BaseSpider):
|
|||||||
for astro in self.astro_list:
|
for astro in self.astro_list:
|
||||||
url = self.astr_base_url + astro
|
url = self.astr_base_url + astro
|
||||||
yield scrapy.Request(url, callback=self.parse_astro_page, meta={'astro': astro})
|
yield scrapy.Request(url, callback=self.parse_astro_page, meta={'astro': astro})
|
||||||
if self.debug:
|
|
||||||
break
|
|
||||||
|
|
||||||
def start_birth(self):
|
def start_birth(self):
|
||||||
for month in range(1, 13):
|
for month in range(1, 13):
|
||||||
for day in range(1, 32):
|
for day in range(1, 32):
|
||||||
url = self.birth_base_url.format(month=month, day=day)
|
url = self.birth_base_url.format(month=month, day=day)
|
||||||
yield scrapy.Request(url, callback=self.parse_birth_page, meta={'month': month, 'day': day})
|
yield scrapy.Request(url, callback=self.parse_birth_page, meta={'month': month, 'day': day})
|
||||||
if self.debug:
|
|
||||||
break
|
|
||||||
|
|
||||||
async def start(self):
|
async def start(self):
|
||||||
# 调用原有 start_requests 方法
|
# 调用原有 start_requests 方法
|
||||||
@ -167,13 +164,18 @@ class IAFDSpider(BaseSpider):
|
|||||||
item['href'] = ethnic_url
|
item['href'] = ethnic_url
|
||||||
yield item
|
yield item
|
||||||
|
|
||||||
yield scrapy.Request(ethnic_url, callback=self.parse_ethnic_page, meta={'ethnic': text})
|
yield scrapy.Request(ethnic_url, callback=self.parse_ethnic_page, meta={'ethnic': text, 'depth':1})
|
||||||
else:
|
else:
|
||||||
self.logger.warning(f"parse page error. url: {response.url}")
|
self.logger.warning(f"parse page error. url: {response.url}")
|
||||||
|
|
||||||
# 获得列表,查询详情
|
# 获得列表,查询详情
|
||||||
def parse_ethnic_page(self, response):
|
def parse_ethnic_page(self, response):
|
||||||
ethnic = response.meta['ethnic']
|
ethnic = response.meta['ethnic']
|
||||||
|
depth = response.meta.get('depth', 1)
|
||||||
|
if self.debug and depth>=3:
|
||||||
|
self.logger.debug(f"debug mode, stop next page. ethnic:{ethnic}, url: {response.url}")
|
||||||
|
return
|
||||||
|
|
||||||
data, next_url = common_parser(html=response.text, page='ethnic', ethnic=ethnic)
|
data, next_url = common_parser(html=response.text, page='ethnic', ethnic=ethnic)
|
||||||
if data:
|
if data:
|
||||||
self.logger.debug(f"fetched data from {response.url}, data len: {len(data)}")
|
self.logger.debug(f"fetched data from {response.url}, data len: {len(data)}")
|
||||||
@ -181,7 +183,7 @@ class IAFDSpider(BaseSpider):
|
|||||||
yield from self._create_performer_request(href=item['href'], name=item['person'])
|
yield from self._create_performer_request(href=item['href'], name=item['person'])
|
||||||
|
|
||||||
if next_url:
|
if next_url:
|
||||||
yield scrapy.Request(next_url, callback=self.parse_ethnic_page, meta={'ethnic': text})
|
yield scrapy.Request(next_url, callback=self.parse_ethnic_page, meta={'ethnic': ethnic, 'depth':depth+1})
|
||||||
else:
|
else:
|
||||||
self.logger.info(f"found all pages. ethnic: {ethnic}, url: {response.url}")
|
self.logger.info(f"found all pages. ethnic: {ethnic}, url: {response.url}")
|
||||||
else:
|
else:
|
||||||
@ -226,7 +228,7 @@ class IAFDSpider(BaseSpider):
|
|||||||
list_type = response.meta.get('list_type', '')
|
list_type = response.meta.get('list_type', '')
|
||||||
data, next_url = common_parser(html=response.text, page=list_type)
|
data, next_url = common_parser(html=response.text, page=list_type)
|
||||||
if data:
|
if data:
|
||||||
self.logger.debug(f"fetched data from {response.url}, data: {data}")
|
self.logger.debug(f"fetched data from {response.url}, data len: {len(data)}")
|
||||||
for movie in data:
|
for movie in data:
|
||||||
yield from self._create_movie_request(href=movie['href'], title=movie['title'])
|
yield from self._create_movie_request(href=movie['href'], title=movie['title'])
|
||||||
else:
|
else:
|
||||||
@ -234,7 +236,9 @@ class IAFDSpider(BaseSpider):
|
|||||||
|
|
||||||
# 统一处理发起影片查询的请求
|
# 统一处理发起影片查询的请求
|
||||||
def _create_performer_request(self, href, name):
|
def _create_performer_request(self, href, name):
|
||||||
if href != '' and is_valid_url(href):
|
if href == '':
|
||||||
|
return
|
||||||
|
if is_valid_url(href):
|
||||||
if self._can_request(href):
|
if self._can_request(href):
|
||||||
self.crawler.stats.inc_value(f"{self.name}/actor_all")
|
self.crawler.stats.inc_value(f"{self.name}/actor_all")
|
||||||
yield scrapy.Request(href,
|
yield scrapy.Request(href,
|
||||||
@ -242,20 +246,21 @@ class IAFDSpider(BaseSpider):
|
|||||||
meta={'name': name, 'item_type':'movie'}
|
meta={'name': name, 'item_type':'movie'}
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
self.logger.warning(f"wrong url. {url}, ignore...")
|
self.logger.warning(f"wrong url. {href}, ignore...")
|
||||||
|
|
||||||
# 统一处理发起影片查询的请求
|
# 统一处理发起影片查询的请求
|
||||||
def _create_movie_request(self, href, title):
|
def _create_movie_request(self, href, title):
|
||||||
if href != '' and is_valid_url(href):
|
if href == '':
|
||||||
|
return
|
||||||
|
if is_valid_url(href):
|
||||||
if self.need_update_movie(href) and self._can_request(href):
|
if self.need_update_movie(href) and self._can_request(href):
|
||||||
self.crawler.stats.inc_value(f"{self.name}/movie_all")
|
self.crawler.stats.inc_value(f"{self.name}/movie_all")
|
||||||
yield scrapy.Request(href,
|
yield scrapy.Request(href,
|
||||||
callback=self.parse_movie_detail_page,
|
callback=self.parse_movie_detail_page,
|
||||||
meta={'title': title, 'item_type':'movie'},
|
meta={'title': title, 'item_type':'movie', 'cache':True}
|
||||||
cache=True
|
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
self.logger.warning(f"wrong url. {url}, ignore...")
|
self.logger.warning(f"wrong url. {href}, ignore...")
|
||||||
|
|
||||||
# 演员详情页解析和处理
|
# 演员详情页解析和处理
|
||||||
def parse_person_detail_page(self, response):
|
def parse_person_detail_page(self, response):
|
||||||
@ -264,6 +269,9 @@ class IAFDSpider(BaseSpider):
|
|||||||
self.logger.debug(f"fetched data from {response.url}, data: {data}")
|
self.logger.debug(f"fetched data from {response.url}, data: {data}")
|
||||||
self.crawler.stats.inc_value(f"{self.name}/actor_done")
|
self.crawler.stats.inc_value(f"{self.name}/actor_done")
|
||||||
item = IafdPerformersItem()
|
item = IafdPerformersItem()
|
||||||
|
item['name'] = response.meta.get('name', '')
|
||||||
|
item['href'] = response.url
|
||||||
|
item['is_full_data'] = 1
|
||||||
for k, v in data.items():
|
for k, v in data.items():
|
||||||
if k in item.fields:
|
if k in item.fields:
|
||||||
item[k] = v
|
item[k] = v
|
||||||
@ -274,9 +282,9 @@ class IAFDSpider(BaseSpider):
|
|||||||
for role, movies in data.get('credits', {}).items():
|
for role, movies in data.get('credits', {}).items():
|
||||||
if movies:
|
if movies:
|
||||||
for item in movies:
|
for item in movies:
|
||||||
yield from self._create_movie_request(href=movie['href'], title=movie['title'])
|
yield from self._create_movie_request(href=item['href'], title=item['title'])
|
||||||
else:
|
else:
|
||||||
self.logger.warning(f"fetched data error. {response.url}")
|
self._handle_invalid_response(response)
|
||||||
|
|
||||||
# 影片详情页解析和处理
|
# 影片详情页解析和处理
|
||||||
def parse_movie_detail_page(self, response):
|
def parse_movie_detail_page(self, response):
|
||||||
@ -286,6 +294,7 @@ class IAFDSpider(BaseSpider):
|
|||||||
self.logger.debug(f"fetched data from {response.url}, data: {data}")
|
self.logger.debug(f"fetched data from {response.url}, data: {data}")
|
||||||
self.crawler.stats.inc_value(f"{self.name}/movie_done")
|
self.crawler.stats.inc_value(f"{self.name}/movie_done")
|
||||||
item = IafdMoviesItem()
|
item = IafdMoviesItem()
|
||||||
|
item['is_full_data'] = 1
|
||||||
for k, v in data.items():
|
for k, v in data.items():
|
||||||
if k in item.fields:
|
if k in item.fields:
|
||||||
item[k] = v
|
item[k] = v
|
||||||
@ -307,24 +316,39 @@ class IAFDSpider(BaseSpider):
|
|||||||
yield from self._create_performer_request(href=director['href'], name=director['name'])
|
yield from self._create_performer_request(href=director['href'], name=director['name'])
|
||||||
|
|
||||||
else:
|
else:
|
||||||
self.logger.warning(f"fetched data error. {response.url}")
|
self._handle_invalid_response(response)
|
||||||
|
|
||||||
# 统一判断并处理异常
|
# 统一判断并处理异常
|
||||||
def _handle_invalid_response(self, response):
|
def _handle_invalid_response(self, response):
|
||||||
|
update_flag = False
|
||||||
if response.status in [200]:
|
if response.status in [200]:
|
||||||
if "invalid or outdated page" in response.text.lower():
|
if "invalid or outdated page" in response.text.lower():
|
||||||
self.logger.warning(f"invalid or outdated page. url: {response.url}, status_code: {response.status}")
|
self.logger.warning(f"invalid or outdated page. url: {response.url}, status_code: {response.status}")
|
||||||
# TODO: 更新404的演员或者影片
|
# TODO: 更新404的演员或者影片
|
||||||
|
update_flag = True
|
||||||
else:
|
else:
|
||||||
self.logger.warning(f"unkown page. url:{response.url}, content: {response.text[:500]}")
|
self.logger.warning(f"unkown page. url:{response.url}, content: {response.text[:500]}")
|
||||||
|
|
||||||
elif response.status in [404, 403]:
|
elif response.status in [404, 403]:
|
||||||
self.logger.warning(f"get 404 page. url: {response.url}")
|
self.logger.warning(f"get 404 page. url: {response.url}")
|
||||||
# TODO: 更新404的演员或者影片
|
# TODO: 更新404的演员或者影片
|
||||||
|
update_flag = True
|
||||||
else:
|
else:
|
||||||
self.logger.warning(f"unkown page. url:{response.url}, status: {response.status}, content: {response.text[:500]}")
|
self.logger.warning(f"unkown page. url:{response.url}, status: {response.status}, content: {response.text[:500]}")
|
||||||
|
|
||||||
|
if update_flag:
|
||||||
|
if 'person.rme' in response.url:
|
||||||
|
item = IafdPerformersItem()
|
||||||
|
item['href'] = response.url
|
||||||
|
item['name'] = response.meta.get('name', '')
|
||||||
|
item['is_full_data'] = 404
|
||||||
|
yield item
|
||||||
|
elif 'title.rme' in response.url:
|
||||||
|
item = IafdMoviesItem()
|
||||||
|
item['href'] = response.url
|
||||||
|
item['title'] = response.meta.get('title', '')
|
||||||
|
item['is_full_data'] = 404
|
||||||
|
yield item
|
||||||
|
|
||||||
def load_existed_actors(self):
|
def load_existed_actors(self):
|
||||||
query_args = {}
|
query_args = {}
|
||||||
@ -366,3 +390,20 @@ class IAFDSpider(BaseSpider):
|
|||||||
|
|
||||||
def acc_movie_to_existed(self, href, is_full_data=1):
|
def acc_movie_to_existed(self, href, is_full_data=1):
|
||||||
self.existed_movies[href] = is_full_data
|
self.existed_movies[href] = is_full_data
|
||||||
|
|
||||||
|
def _can_request(self, href):
    """Decide whether ``href`` may be requested, and record it if so.

    A link is refused when it is already present in ``self.requested_url``.
    When ``self.debug`` is truthy, a link containing ``person.rme`` or
    ``title.rme`` is also refused once two recorded links already contain
    that same key (recorded links are compared case-insensitively).

    Args:
        href: The URL about to be requested.

    Returns:
        ``True`` if the URL was accepted (and added to ``self.requested_url``),
        ``False`` otherwise.
    """
    if href in self.requested_url:
        return False

    if self.debug:  # in debug mode, cap how many URLs of each kind are issued
        debug_limit = 2
        href_lower = href.lower()  # hoisted: invariant across the key loop
        for key in ('person.rme', 'title.rme'):
            # Only count recorded URLs for a key this href actually matches;
            # the count is irrelevant for any other key.
            if key not in href_lower:
                continue
            seen = sum(1 for url in self.requested_url if key in url.lower())
            if seen >= debug_limit:
                return False

    self.requested_url.add(href)
    return True
|
||||||
@ -489,6 +489,18 @@ def extract_year_from_date_string(date_str):
|
|||||||
except TypeError:
|
except TypeError:
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
def dist_stu_href_rewrite(href):
    """Rewrite a distributor/studio link into its canonical IAFD URL.

    Searches ``href`` for a ``distrib=<digits>`` or ``studio=<digits>`` token
    and rebuilds the link as ``https://www.iafd.com/<key>.rme/<key>=<id>``.

    Args:
        href: Link text to inspect. ``None``, empty and non-string values are
            treated as "not a target URL".

    Returns:
        The rewritten URL, or ``None`` when ``href`` carries no
        distributor/studio id.
    """
    import re  # kept function-local, as in the original

    # Guard: re.search raises TypeError on None or non-string input.
    if not href or not isinstance(href, str):
        return None

    # Extract the id (works for both distrib and studio links).
    match = re.search(r"(distrib|studio)=(\d+)", href)
    if match is None:
        return None  # not a target URL

    key, id_number = match.groups()
    return f"https://www.iafd.com/{key}.rme/{key}={id_number}"
|
||||||
|
|
||||||
|
|
||||||
# 解析网页 HTML 并提取电影信息
|
# 解析网页 HTML 并提取电影信息
|
||||||
def parse_page_movie(soup, href, title):
|
def parse_page_movie(soup, href, title):
|
||||||
# 解析电影基础信息
|
# 解析电影基础信息
|
||||||
@ -518,6 +530,12 @@ def parse_page_movie(soup, href, title):
|
|||||||
else:
|
else:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
if 'DistributorHref' in movie_data and 'distrib' in movie_data['DistributorHref']:
|
||||||
|
movie_data['DistributorHref'] = dist_stu_href_rewrite(movie_data['DistributorHref'])
|
||||||
|
|
||||||
|
if 'StudioHref' in movie_data and 'studio' in movie_data['StudioHref']:
|
||||||
|
movie_data['StudioHref'] = dist_stu_href_rewrite(movie_data['StudioHref'])
|
||||||
|
|
||||||
# 解析演职人员信息
|
# 解析演职人员信息
|
||||||
performers = []
|
performers = []
|
||||||
cast_divs = soup.find_all("div", class_="castbox")
|
cast_divs = soup.find_all("div", class_="castbox")
|
||||||
|
|||||||
Reference in New Issue
Block a user