modify scripts
This commit is contained in:
@ -8,4 +8,6 @@ scrapy crawl pbox -a mod='update' -a begin='2025-07-16'
|
||||
scrapy crawl pbox -a debug=1 -a cmd='studio,movies'
|
||||
scrapy crawl pbox -a debug='1' -s STATS_PUSH_MSG=False -a cmd='movies' -s LOG_LEVEL=DEBUG -a mod='update' -a begin='2025-07-16'
|
||||
|
||||
scrapy crawl iafd -a debug=1 -a cmd=performers -s STATS_EXPORT_INTERVAL=60 -s LOG_LEVEL=DEBUG
|
||||
scrapy crawl javbus -a cmd=actors -s HTTPCACHE_DIR=/home/ubuntu/sharedata/scrapy_cached/
|
||||
|
||||
scrapy crawl iafd -a cmd='astro,ethnic,dist,stu' -s HTTPCACHE_DIR=/home/ubuntu/sharedata/scrapy_cached/
|
||||
@ -1067,7 +1067,7 @@ class IAFDDBHandler(SQLiteDBHandler):
|
||||
|
||||
# """插入电影索引,来自于列表数据"""
|
||||
#def insert_movie_index(self, title, href, release_year=0, from_performer_list=None, from_dist_list=None, from_stu_list=None):
|
||||
def insert_movie_index(self, title, href, release_year=0, from_performer_list=None, from_dist_list=None, from_stu_list=None):
|
||||
def insert_movie_index(self, title, href, **kwargs):
|
||||
fields = [
|
||||
'from_performer_list', 'from_dist_list', 'from_stu_list', 'release_year'
|
||||
]
|
||||
@ -1104,7 +1104,7 @@ class IAFDDBHandler(SQLiteDBHandler):
|
||||
performer_id = self.insert_or_update_common(data=data, tbl_name=self.tbl_name_performers, uniq_key='href', exists_do_nothing=False)
|
||||
if performer_id is None:
|
||||
return None
|
||||
logging.debug(f"insert one performer, id: {performer_id}, name: {data['person']}, href: {data['href']}")
|
||||
logging.debug(f"insert one performer, id: {performer_id}, name: {data['name']}, href: {data['href']}")
|
||||
|
||||
# 插入新的 alias
|
||||
for alias in data.get("performer_aka", []):
|
||||
@ -1115,7 +1115,6 @@ class IAFDDBHandler(SQLiteDBHandler):
|
||||
composite_pk = ['performer_id', 'alias'],
|
||||
exists_do_nothing = True
|
||||
)
|
||||
conn.commit()
|
||||
|
||||
# 插入影片列表,可能有 personal 和 director 两个身份
|
||||
if movies_update:
|
||||
@ -1126,7 +1125,7 @@ class IAFDDBHandler(SQLiteDBHandler):
|
||||
movie_id = self.get_id_by_key(tbl=self.tbl_name_movies, uniq_key='href', val=movie['href'])
|
||||
# 影片不存在,先插入
|
||||
if movie_id is None:
|
||||
movie_id = self.insert_movie_index(movie['title'], movie['href'], utils.to_number(movie['year']), from_performer_list=1)
|
||||
movie_id = self.insert_movie_index(movie['title'], movie['href'], release_year=int(movie['year']), from_performer_list=1)
|
||||
if movie_id:
|
||||
tmp_id = self.insert_performer_movie(performer_id, movie_id, role, movie['notes'])
|
||||
if tmp_id :
|
||||
@ -1137,11 +1136,11 @@ class IAFDDBHandler(SQLiteDBHandler):
|
||||
return performer_id
|
||||
|
||||
except sqlite3.Error as e:
|
||||
conn.rollback()
|
||||
self.conn.rollback()
|
||||
logging.error(f"数据库错误: {e}")
|
||||
return None
|
||||
except Exception as e:
|
||||
conn.rollback()
|
||||
self.conn.rollback()
|
||||
logging.error(f"未知错误: {e}")
|
||||
return None
|
||||
|
||||
@ -1175,7 +1174,7 @@ class IAFDDBHandler(SQLiteDBHandler):
|
||||
sql += " AND name LIKE ?"
|
||||
params.append(f"%{filters['name']}%")
|
||||
|
||||
cursor.execute(sql, params)
|
||||
self.cursor.execute(sql, params)
|
||||
#return [row[0].lower() for row in cursor.fetchall()] # 链接使用小写
|
||||
return [{'href': row[0], 'name': row[1]} for row in cursor.fetchall()]
|
||||
|
||||
@ -1203,7 +1202,7 @@ class IAFDDBHandler(SQLiteDBHandler):
|
||||
sql += " AND name LIKE ?"
|
||||
params.append(f"%{filters['name']}%")
|
||||
|
||||
cursor.execute(sql, params)
|
||||
self.cursor.execute(sql, params)
|
||||
return [row[0].lower() for row in cursor.fetchall()] # 链接使用小写
|
||||
|
||||
except sqlite3.Error as e:
|
||||
@ -1230,7 +1229,7 @@ class IAFDDBHandler(SQLiteDBHandler):
|
||||
sql += " AND name LIKE ?"
|
||||
params.append(f"%{filters['name']}%")
|
||||
|
||||
cursor.execute(sql, params)
|
||||
self.cursor.execute(sql, params)
|
||||
return [row[0].lower() for row in cursor.fetchall()] # 链接使用小写
|
||||
|
||||
except sqlite3.Error as e:
|
||||
@ -1306,7 +1305,7 @@ class IAFDDBHandler(SQLiteDBHandler):
|
||||
return movie_id
|
||||
|
||||
except Exception as e:
|
||||
conn.rollback()
|
||||
self.conn.rollback()
|
||||
logging.error("Error inserting movie: %s", e)
|
||||
return None
|
||||
|
||||
@ -1569,7 +1568,7 @@ class IAFDDBHandler(SQLiteDBHandler):
|
||||
if limit is not None:
|
||||
sql += f" LIMIT {limit}"
|
||||
|
||||
cursor.execute(sql)
|
||||
self.cursor.execute(sql)
|
||||
return [{'href': row[0], 'name': row[1]} for row in cursor.fetchall()]
|
||||
|
||||
except sqlite3.Error as e:
|
||||
@ -1589,19 +1588,19 @@ class IAFDDBHandler(SQLiteDBHandler):
|
||||
"CREATE INDEX idx_iafd_performers_id ON iafd_performers (id);")
|
||||
]
|
||||
for index_name, create_index_sql in indexes:
|
||||
cursor.execute("SELECT name FROM sqlite_master WHERE type='index' AND name=?", (index_name,))
|
||||
if not cursor.fetchone():
|
||||
cursor.execute(create_index_sql)
|
||||
self.cursor.execute("SELECT name FROM sqlite_master WHERE type='index' AND name=?", (index_name,))
|
||||
if not self.cursor.fetchone():
|
||||
self.cursor.execute(create_index_sql)
|
||||
logging.info(f"Index {index_name} created successfully.")
|
||||
else:
|
||||
logging.info(f"Index {index_name} already exists.")
|
||||
|
||||
# 检查视图是否存在,如果不存在则创建
|
||||
view_name = f"iafd_tmp_performers_stat_{taskid}"
|
||||
cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name=?", (view_name,))
|
||||
if cursor.fetchone():
|
||||
cursor.execute("drop table ?", (view_name,))
|
||||
conn.commit()
|
||||
self.cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name=?", (view_name,))
|
||||
if self.cursor.fetchone():
|
||||
self.cursor.execute("drop table ?", (view_name,))
|
||||
self.conn.commit()
|
||||
|
||||
create_view_sql = f"""
|
||||
CREATE table {view_name} AS
|
||||
@ -1646,11 +1645,11 @@ class IAFDDBHandler(SQLiteDBHandler):
|
||||
GROUP BY
|
||||
id, href, name, movies_cnt;
|
||||
"""
|
||||
cursor.execute(create_view_sql)
|
||||
self.cursor.execute(create_view_sql)
|
||||
logging.info(f"table {view_name} created successfully.")
|
||||
|
||||
# 提交更改并关闭连接
|
||||
conn.commit()
|
||||
self.conn.commit()
|
||||
except sqlite3.Error as e:
|
||||
logging.warning(f"An error occurred: {e}")
|
||||
|
||||
@ -1659,7 +1658,7 @@ class IAFDDBHandler(SQLiteDBHandler):
|
||||
def reset_actor_movies(self, check_and_do = 0):
|
||||
try:
|
||||
# 检查表中是否已存在movies_cnt列
|
||||
cursor.execute(f"PRAGMA table_info(iafd_performers);")
|
||||
self.cursor.execute(f"PRAGMA table_info(iafd_performers);")
|
||||
columns = [row[1] for row in cursor.fetchall()]
|
||||
|
||||
if 'movies_cnt' not in columns:
|
||||
@ -1667,19 +1666,19 @@ class IAFDDBHandler(SQLiteDBHandler):
|
||||
add_field_sql = f"""
|
||||
ALTER TABLE iafd_performers ADD COLUMN movies_cnt INTEGER DEFAULT 0 NOT NULL;
|
||||
"""
|
||||
cursor.execute(add_field_sql)
|
||||
self.cursor.execute(add_field_sql)
|
||||
logging.info("成功添加movies_cnt字段")
|
||||
else:
|
||||
logging.info("movies_cnt字段已存在,跳过添加")
|
||||
|
||||
# 确保关联表有索引
|
||||
cursor.execute(f"""
|
||||
self.cursor.execute(f"""
|
||||
CREATE INDEX IF NOT EXISTS idx_iafd_performers_movies_performer_id
|
||||
ON iafd_performers_movies(performer_id);
|
||||
""")
|
||||
|
||||
# 创建临时表存储统计结果
|
||||
cursor.execute(f"""
|
||||
self.cursor.execute(f"""
|
||||
CREATE TEMPORARY TABLE temp_actor_counts AS
|
||||
SELECT performer_id, COUNT(movie_id) AS cnt
|
||||
FROM iafd_performers_movies
|
||||
@ -1687,10 +1686,10 @@ class IAFDDBHandler(SQLiteDBHandler):
|
||||
""")
|
||||
|
||||
# 为临时表添加索引
|
||||
cursor.execute("CREATE INDEX idx_temp_performer_id ON temp_actor_counts(performer_id);")
|
||||
self.cursor.execute("CREATE INDEX idx_temp_performer_id ON temp_actor_counts(performer_id);")
|
||||
|
||||
# 更新主表
|
||||
cursor.execute(f"""
|
||||
self.cursor.execute(f"""
|
||||
UPDATE iafd_performers
|
||||
SET movies_cnt = COALESCE((
|
||||
SELECT cnt FROM temp_actor_counts
|
||||
@ -1702,12 +1701,12 @@ class IAFDDBHandler(SQLiteDBHandler):
|
||||
logging.info(f"成功更新{updated_rows}个演员的影片数量")
|
||||
|
||||
# 清理资源
|
||||
cursor.execute("DROP TABLE IF EXISTS temp_actor_counts;")
|
||||
conn.commit()
|
||||
self.cursor.execute("DROP TABLE IF EXISTS temp_actor_counts;")
|
||||
self.conn.commit()
|
||||
|
||||
logging.info("任务执行完成!")
|
||||
|
||||
except sqlite3.Error as e:
|
||||
conn.rollback()
|
||||
self.conn.rollback()
|
||||
logging.error("Error updating actor movie_cnt: %s", e)
|
||||
|
||||
|
||||
@ -6,15 +6,6 @@ from twisted.internet import reactor, defer, asyncioreactor
|
||||
import time
|
||||
|
||||
class BaseSpider(scrapy.Spider):
|
||||
def __init__(self, *args, **kwargs):
|
||||
self.requested_url = set()
|
||||
|
||||
# 记录本次任务已经发起的请求链接
|
||||
def _can_request(self, href):
|
||||
if href in self.requested_url:
|
||||
return False
|
||||
self.requested_url.add(href)
|
||||
return True
|
||||
|
||||
def start_requests(self):
|
||||
"""统一处理请求生成,兼容不同入口点"""
|
||||
|
||||
@ -44,6 +44,7 @@ class IAFDSpider(BaseSpider):
|
||||
self.existed_movies = {}
|
||||
self.load_existed_actors()
|
||||
self.load_existed_movies()
|
||||
self.requested_url = set()
|
||||
|
||||
# 入口函数,由基类的方法触发
|
||||
def custom_start_requests(self):
|
||||
@ -112,16 +113,12 @@ class IAFDSpider(BaseSpider):
|
||||
for astro in self.astro_list:
|
||||
url = self.astr_base_url + astro
|
||||
yield scrapy.Request(url, callback=self.parse_astro_page, meta={'astro': astro})
|
||||
if self.debug:
|
||||
break
|
||||
|
||||
def start_birth(self):
|
||||
for month in range(1, 13):
|
||||
for day in range(1, 32):
|
||||
url = self.birth_base_url.format(month=month, day=day)
|
||||
yield scrapy.Request(url, callback=self.parse_birth_page, meta={'month': month, 'day': day})
|
||||
if self.debug:
|
||||
break
|
||||
|
||||
async def start(self):
|
||||
# 调用原有 start_requests 方法
|
||||
@ -167,13 +164,18 @@ class IAFDSpider(BaseSpider):
|
||||
item['href'] = ethnic_url
|
||||
yield item
|
||||
|
||||
yield scrapy.Request(ethnic_url, callback=self.parse_ethnic_page, meta={'ethnic': text})
|
||||
yield scrapy.Request(ethnic_url, callback=self.parse_ethnic_page, meta={'ethnic': text, 'depth':1})
|
||||
else:
|
||||
self.logger.warning(f"parse page error. url: {response.url}")
|
||||
|
||||
# 获得列表,查询详情
|
||||
def parse_ethnic_page(self, response):
|
||||
ethnic = response.meta['ethnic']
|
||||
depth = response.meta.get('depth', 1)
|
||||
if self.debug and depth>=3:
|
||||
self.logger.debug(f"debug mode, stop next page. ethnic:{ethnic}, url: {response.url}")
|
||||
return
|
||||
|
||||
data, next_url = common_parser(html=response.text, page='ethnic', ethnic=ethnic)
|
||||
if data:
|
||||
self.logger.debug(f"fetched data from {response.url}, data len: {len(data)}")
|
||||
@ -181,7 +183,7 @@ class IAFDSpider(BaseSpider):
|
||||
yield from self._create_performer_request(href=item['href'], name=item['person'])
|
||||
|
||||
if next_url:
|
||||
yield scrapy.Request(next_url, callback=self.parse_ethnic_page, meta={'ethnic': text})
|
||||
yield scrapy.Request(next_url, callback=self.parse_ethnic_page, meta={'ethnic': ethnic, 'depth':depth+1})
|
||||
else:
|
||||
self.logger.info(f"found all pages. ethnic: {ethnic}, url: {response.url}")
|
||||
else:
|
||||
@ -226,7 +228,7 @@ class IAFDSpider(BaseSpider):
|
||||
list_type = response.meta.get('list_type', '')
|
||||
data, next_url = common_parser(html=response.text, page=list_type)
|
||||
if data:
|
||||
self.logger.debug(f"fetched data from {response.url}, data: {data}")
|
||||
self.logger.debug(f"fetched data from {response.url}, data len: {len(data)}")
|
||||
for movie in data:
|
||||
yield from self._create_movie_request(href=movie['href'], title=movie['title'])
|
||||
else:
|
||||
@ -234,7 +236,9 @@ class IAFDSpider(BaseSpider):
|
||||
|
||||
# 统一处理发起影片查询的请求
|
||||
def _create_performer_request(self, href, name):
|
||||
if href != '' and is_valid_url(href):
|
||||
if href == '':
|
||||
return
|
||||
if is_valid_url(href):
|
||||
if self._can_request(href):
|
||||
self.crawler.stats.inc_value(f"{self.name}/actor_all")
|
||||
yield scrapy.Request(href,
|
||||
@ -242,20 +246,21 @@ class IAFDSpider(BaseSpider):
|
||||
meta={'name': name, 'item_type':'movie'}
|
||||
)
|
||||
else:
|
||||
self.logger.warning(f"wrong url. {url}, ignore...")
|
||||
self.logger.warning(f"wrong url. {href}, ignore...")
|
||||
|
||||
# 统一处理发起影片查询的请求
|
||||
def _create_movie_request(self, href, title):
|
||||
if href != '' and is_valid_url(href):
|
||||
if href == '':
|
||||
return
|
||||
if is_valid_url(href):
|
||||
if self.need_update_movie(href) and self._can_request(href):
|
||||
self.crawler.stats.inc_value(f"{self.name}/movie_all")
|
||||
yield scrapy.Request(href,
|
||||
callback=self.parse_movie_detail_page,
|
||||
meta={'title': title, 'item_type':'movie'},
|
||||
cache=True
|
||||
meta={'title': title, 'item_type':'movie', 'cache':True}
|
||||
)
|
||||
else:
|
||||
self.logger.warning(f"wrong url. {url}, ignore...")
|
||||
self.logger.warning(f"wrong url. {href}, ignore...")
|
||||
|
||||
# 演员详情页解析和处理
|
||||
def parse_person_detail_page(self, response):
|
||||
@ -264,6 +269,9 @@ class IAFDSpider(BaseSpider):
|
||||
self.logger.debug(f"fetched data from {response.url}, data: {data}")
|
||||
self.crawler.stats.inc_value(f"{self.name}/actor_done")
|
||||
item = IafdPerformersItem()
|
||||
item['name'] = response.meta.get('name', '')
|
||||
item['href'] = response.url
|
||||
item['is_full_data'] = 1
|
||||
for k, v in data.items():
|
||||
if k in item.fields:
|
||||
item[k] = v
|
||||
@ -274,9 +282,9 @@ class IAFDSpider(BaseSpider):
|
||||
for role, movies in data.get('credits', {}).items():
|
||||
if movies:
|
||||
for item in movies:
|
||||
yield from self._create_movie_request(href=movie['href'], title=movie['title'])
|
||||
yield from self._create_movie_request(href=item['href'], title=item['title'])
|
||||
else:
|
||||
self.logger.warning(f"fetched data error. {response.url}")
|
||||
self._handle_invalid_response(response)
|
||||
|
||||
# 影片详情页解析和处理
|
||||
def parse_movie_detail_page(self, response):
|
||||
@ -286,6 +294,7 @@ class IAFDSpider(BaseSpider):
|
||||
self.logger.debug(f"fetched data from {response.url}, data: {data}")
|
||||
self.crawler.stats.inc_value(f"{self.name}/movie_done")
|
||||
item = IafdMoviesItem()
|
||||
item['is_full_data'] = 1
|
||||
for k, v in data.items():
|
||||
if k in item.fields:
|
||||
item[k] = v
|
||||
@ -307,24 +316,39 @@ class IAFDSpider(BaseSpider):
|
||||
yield from self._create_performer_request(href=director['href'], name=director['name'])
|
||||
|
||||
else:
|
||||
self.logger.warning(f"fetched data error. {response.url}")
|
||||
self._handle_invalid_response(response)
|
||||
|
||||
# 统一判断并处理异常
|
||||
def _handle_invalid_response(self, response):
    """Classify a detail page that could not be parsed and flag dead links.

    A page counts as gone when it answers 200 but its text contains
    "invalid or outdated page", or when it answers 404/403. For a gone
    page whose URL contains 'person.rme' or 'title.rme', one item is
    yielded with ``is_full_data`` set to 404 so the record can be marked.
    Any other response is only logged.

    This is a generator: nothing in the body runs until the result is
    iterated (for example ``yield from self._handle_invalid_response(r)``).
    NOTE(review): the visible call sites invoke it as a bare statement;
    confirm they iterate the result, otherwise no logging or item
    emission happens.

    Args:
        response: The response object; ``status``, ``text``, ``url`` and
            ``meta`` are read.

    Yields:
        An ``IafdPerformersItem`` or ``IafdMoviesItem`` marked with
        ``is_full_data = 404``, at most one per call.
    """
    status = response.status
    page_gone = False

    if status == 200:
        if "invalid or outdated page" in response.text.lower():
            self.logger.warning(f"invalid or outdated page. url: {response.url}, status_code: {response.status}")
            page_gone = True
        else:
            self.logger.warning(f"unkown page. url:{response.url}, content: {response.text[:500]}")
    elif status in (404, 403):
        # Report the real status: this branch also handles 403.
        self.logger.warning(f"get {status} page. url: {response.url}")
        page_gone = True
    else:
        self.logger.warning(f"unkown page. url:{response.url}, status: {response.status}, content: {response.text[:500]}")

    if not page_gone:
        return

    # Emit a marker item so the performer/movie record is flagged as gone.
    if 'person.rme' in response.url:
        item = IafdPerformersItem()
        item['href'] = response.url
        item['name'] = response.meta.get('name', '')
        item['is_full_data'] = 404
        yield item
    elif 'title.rme' in response.url:
        item = IafdMoviesItem()
        item['href'] = response.url
        item['title'] = response.meta.get('title', '')
        item['is_full_data'] = 404
        yield item
|
||||
|
||||
def load_existed_actors(self):
|
||||
query_args = {}
|
||||
@ -366,3 +390,20 @@ class IAFDSpider(BaseSpider):
|
||||
|
||||
def acc_movie_to_existed(self, href, is_full_data=1):
|
||||
self.existed_movies[href] = is_full_data
|
||||
|
||||
def _can_request(self, href):
|
||||
if href in self.requested_url:
|
||||
return False
|
||||
|
||||
if self.debug: # 某些条件下,限定url的发起次数
|
||||
keys = ['person.rme', 'title.rme']
|
||||
for key in keys:
|
||||
count = 0
|
||||
for url in self.requested_url:
|
||||
if key.lower() in url.lower():
|
||||
count+=1
|
||||
if count >=2 and key in href.lower():
|
||||
return False
|
||||
|
||||
self.requested_url.add(href)
|
||||
return True
|
||||
@ -489,6 +489,18 @@ def extract_year_from_date_string(date_str):
|
||||
except TypeError:
|
||||
return 0
|
||||
|
||||
def dist_stu_href_rewrite(href, base_url="https://www.iafd.com"):
    """Rewrite a distributor/studio link into its short canonical form.

    Searches *href* for ``distrib=<digits>`` or ``studio=<digits>`` and
    rebuilds the link as ``<base_url>/<key>.rme/<key>=<id>``, dropping
    everything else the original link carried.

    Args:
        href: Link to inspect. ``None``, an empty string or a non-string
            value is treated as "not a target URL".
        base_url: Site root for the rebuilt link. Defaults to the host
            that was previously hard-coded; a trailing slash is ignored.

    Returns:
        The rewritten URL, or ``None`` when *href* carries no
        distributor/studio id.
    """
    import re  # local import keeps the helper self-contained

    # Unusable input is reported the same way as "no match".
    if not isinstance(href, str) or not href:
        return None

    # Extract the id (works for both distrib and studio links).
    match = re.search(r"(distrib|studio)=(\d+)", href)
    if match is None:
        return None  # not a target URL

    key, id_number = match.groups()
    return f"{base_url.rstrip('/')}/{key}.rme/{key}={id_number}"
|
||||
|
||||
|
||||
# 解析网页 HTML 并提取电影信息
|
||||
def parse_page_movie(soup, href, title):
|
||||
# 解析电影基础信息
|
||||
@ -518,6 +530,12 @@ def parse_page_movie(soup, href, title):
|
||||
else:
|
||||
return None
|
||||
|
||||
if 'DistributorHref' in movie_data and 'distrib' in movie_data['DistributorHref']:
|
||||
movie_data['DistributorHref'] = dist_stu_href_rewrite(movie_data['DistributorHref'])
|
||||
|
||||
if 'StudioHref' in movie_data and 'studio' in movie_data['StudioHref']:
|
||||
movie_data['StudioHref'] = dist_stu_href_rewrite(movie_data['StudioHref'])
|
||||
|
||||
# 解析演职人员信息
|
||||
performers = []
|
||||
cast_divs = soup.find_all("div", class_="castbox")
|
||||
|
||||
Reference in New Issue
Block a user