modify scripts
This commit is contained in:
@ -244,6 +244,7 @@ def fetch_performers_detail_once(perfomers_list):
|
|||||||
url = performer['href']
|
url = performer['href']
|
||||||
person = performer['name']
|
person = performer['name']
|
||||||
curr_id = performer['id']
|
curr_id = performer['id']
|
||||||
|
movies_cnt = performer['movies_cnt']
|
||||||
logging.debug(f"Fetching data for performer ({person}), url {url} ...")
|
logging.debug(f"Fetching data for performer ({person}), url {url} ...")
|
||||||
soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="headshot", attr_type="id"))
|
soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="headshot", attr_type="id"))
|
||||||
# 从本地读取的文件,忽略
|
# 从本地读取的文件,忽略
|
||||||
@ -253,6 +254,14 @@ def fetch_performers_detail_once(perfomers_list):
|
|||||||
if soup:
|
if soup:
|
||||||
data = scraper.parse_page_performer(soup, url)
|
data = scraper.parse_page_performer(soup, url)
|
||||||
if data:
|
if data:
|
||||||
|
# 检查影片数量是否有更新
|
||||||
|
page_movies_cnt = int(data.get('movies_cnt', '0'))
|
||||||
|
if page_movies_cnt <= movies_cnt:
|
||||||
|
if not force:
|
||||||
|
logging.info(f"actor already update. skipping... person: ({person}), url: {url}")
|
||||||
|
last_performer_id = curr_id
|
||||||
|
continue
|
||||||
|
|
||||||
performer_id = db_tools.insert_or_update_performer({
|
performer_id = db_tools.insert_or_update_performer({
|
||||||
'href': url,
|
'href': url,
|
||||||
'person': person,
|
'person': person,
|
||||||
@ -377,6 +386,8 @@ def fetch_movies_detail():
|
|||||||
if debug:
|
if debug:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
def reset_actor_movie_cnt():
    """Run the performer movie-count reset provided by ``db_tools``.

    Takes no arguments and returns nothing; all work is delegated to
    ``db_tools.reset_actor_movies()``.
    """
    db_tools.reset_actor_movies()
|
||||||
|
|
||||||
# 建立缩写到函数的映射
|
# 建立缩写到函数的映射
|
||||||
function_map = {
|
function_map = {
|
||||||
@ -387,6 +398,7 @@ function_map = {
|
|||||||
"stu" : fetch_movies_by_stu,
|
"stu" : fetch_movies_by_stu,
|
||||||
"performers": fetch_performers_detail,
|
"performers": fetch_performers_detail,
|
||||||
"movies" : fetch_movies_detail,
|
"movies" : fetch_movies_detail,
|
||||||
|
"reset_mv" : reset_actor_movie_cnt,
|
||||||
}
|
}
|
||||||
|
|
||||||
# 主函数
|
# 主函数
|
||||||
|
|||||||
@ -329,7 +329,7 @@ def query_performer(identifier):
|
|||||||
# 按条件查询 href 列表
|
# 按条件查询 href 列表
|
||||||
def query_performer_hrefs(**filters):
|
def query_performer_hrefs(**filters):
|
||||||
try:
|
try:
|
||||||
sql = "SELECT href, name, id FROM iafd_performers WHERE 1=1"
|
sql = "SELECT href, name, id, movies_cnt FROM iafd_performers WHERE 1=1"
|
||||||
params = []
|
params = []
|
||||||
|
|
||||||
if "id" in filters:
|
if "id" in filters:
|
||||||
@ -375,7 +375,7 @@ def query_performer_hrefs(**filters):
|
|||||||
logging.debug(f"query sql: {sql}")
|
logging.debug(f"query sql: {sql}")
|
||||||
cursor.execute(sql, params)
|
cursor.execute(sql, params)
|
||||||
#return [row[0].lower() for row in cursor.fetchall()] # 返回小写
|
#return [row[0].lower() for row in cursor.fetchall()] # 返回小写
|
||||||
return [{'href': row[0], 'name': row[1], 'id':row[2]} for row in cursor.fetchall()]
|
return [{'href': row[0], 'name': row[1], 'id':row[2], 'movies_cnt':row[3]} for row in cursor.fetchall()]
|
||||||
|
|
||||||
except sqlite3.Error as e:
|
except sqlite3.Error as e:
|
||||||
logging.error(f"查询 href 失败: {e}")
|
logging.error(f"查询 href 失败: {e}")
|
||||||
@ -905,6 +905,62 @@ def check_and_create_stat_table(taskid = 0):
|
|||||||
logging.warning(f"An error occurred: {e}")
|
logging.warning(f"An error occurred: {e}")
|
||||||
|
|
||||||
|
|
||||||
|
# Recompute iafd_performers.movies_cnt from the performer-movie link table.
def reset_actor_movies(check_and_do=0):
    """Refresh ``iafd_performers.movies_cnt`` from ``iafd_performers_movies``.

    Steps:
      1. Add the ``movies_cnt`` column to ``iafd_performers`` if it is missing.
      2. Make sure ``iafd_performers_movies(performer_id)`` is indexed.
      3. Aggregate the per-performer movie counts into a temporary table.
      4. Copy the counts into ``iafd_performers`` (0 for performers that have
         no linked movie) and commit.

    Uses the module-level ``cursor`` and ``conn`` objects.

    Args:
        check_and_do: Accepted for signature compatibility; not used by this
            function.

    Returns:
        None. SQLite errors are rolled back and logged, not raised.
    """
    try:
        # Check whether the movies_cnt column already exists.
        cursor.execute("PRAGMA table_info(iafd_performers);")
        columns = [row[1] for row in cursor.fetchall()]

        if 'movies_cnt' not in columns:
            # Column is missing: add it.
            cursor.execute("""
                ALTER TABLE iafd_performers ADD COLUMN movies_cnt INTEGER DEFAULT 0 NOT NULL;
            """)
            logging.info("成功添加movies_cnt字段")
        else:
            logging.info("movies_cnt字段已存在,跳过添加")

        # Make sure the link table is indexed on the grouping/lookup column.
        cursor.execute("""
            CREATE INDEX IF NOT EXISTS idx_iafd_performers_movies_performer_id
            ON iafd_performers_movies(performer_id);
        """)

        # A previous run that failed part-way may have left the scratch table
        # on this connection (rollback does not necessarily undo the DDL), in
        # which case CREATE below would fail with "table already exists".
        cursor.execute("DROP TABLE IF EXISTS temp.temp_actor_counts;")

        # Aggregate the counts once into a temporary table.
        cursor.execute("""
            CREATE TEMPORARY TABLE temp_actor_counts AS
            SELECT performer_id, COUNT(movie_id) AS cnt
            FROM iafd_performers_movies
            GROUP BY performer_id;
        """)

        # Index the temporary table for the correlated lookup below.
        cursor.execute("CREATE INDEX idx_temp_performer_id ON temp_actor_counts(performer_id);")

        # Update the main table; COALESCE covers performers without movies.
        cursor.execute("""
            UPDATE iafd_performers
            SET movies_cnt = COALESCE((
                SELECT cnt FROM temp_actor_counts
                WHERE performer_id = iafd_performers.id
            ), 0);
        """)

        updated_rows = cursor.rowcount
        logging.info(f"成功更新{updated_rows}个演员的影片数量")

        conn.commit()

        logging.info("任务执行完成!")

    except sqlite3.Error as e:
        conn.rollback()
        logging.error("Error updating actor movie_cnt: %s", e)

    finally:
        # Always remove the scratch table (and its index) so the next call on
        # this connection starts clean, whether this run succeeded or failed.
        try:
            cursor.execute("DROP TABLE IF EXISTS temp.temp_actor_counts;")
        except sqlite3.Error as e:
            logging.warning("Failed to drop temp_actor_counts: %s", e)
|
||||||
|
|
||||||
|
|
||||||
# 插入一条任务日志
|
# 插入一条任务日志
|
||||||
def insert_task_log():
|
def insert_task_log():
|
||||||
|
|||||||
@ -178,6 +178,7 @@ class JavbusCrawler(GenericCrawler):
|
|||||||
"""
|
"""
|
||||||
result = {
|
result = {
|
||||||
'avatar': {},
|
'avatar': {},
|
||||||
|
'title' : {},
|
||||||
'movies': []
|
'movies': []
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -189,6 +190,9 @@ class JavbusCrawler(GenericCrawler):
|
|||||||
else:
|
else:
|
||||||
logging.debug(f"avatar-box not found. href: {href}")
|
logging.debug(f"avatar-box not found. href: {href}")
|
||||||
|
|
||||||
|
# 解析页面上的title,获取影片数量等信息
|
||||||
|
result['title'] = self.parse_title_info(soup, href)
|
||||||
|
|
||||||
# 解析影片列表
|
# 解析影片列表
|
||||||
movie_boxes = soup.find_all('a', class_='movie-box')
|
movie_boxes = soup.find_all('a', class_='movie-box')
|
||||||
if movie_boxes:
|
if movie_boxes:
|
||||||
@ -336,17 +340,9 @@ class JavbusCrawler(GenericCrawler):
|
|||||||
|
|
||||||
return movie_info
|
return movie_info
|
||||||
|
|
||||||
|
# 获取页面头部的信息
|
||||||
# 获取演员详情
|
def parse_title_info(self, soup, href):
|
||||||
def parse_studios_labels_series_detail(self, soup, href):
|
title_info = {}
|
||||||
"""
|
|
||||||
解析Javbus网页内容,提取演员信息和影片列表
|
|
||||||
"""
|
|
||||||
result = {
|
|
||||||
'meta': {},
|
|
||||||
'movies': []
|
|
||||||
}
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# 解析标题
|
# 解析标题
|
||||||
b_tag = soup.select_one('.alert.alert-success.alert-common p b')
|
b_tag = soup.select_one('.alert.alert-success.alert-common p b')
|
||||||
@ -367,8 +363,8 @@ class JavbusCrawler(GenericCrawler):
|
|||||||
# 提取前两个元素作为工作室和角色
|
# 提取前两个元素作为工作室和角色
|
||||||
studio = parts[video_index - 2]
|
studio = parts[video_index - 2]
|
||||||
role = parts[video_index - 1]
|
role = parts[video_index - 1]
|
||||||
result['meta']['title'] = studio
|
title_info['title'] = studio
|
||||||
result['meta']['role'] = role
|
title_info['role'] = role
|
||||||
else:
|
else:
|
||||||
logging.debug(f"无法按规则解析: {' - '.join(parts)}")
|
logging.debug(f"无法按规则解析: {' - '.join(parts)}")
|
||||||
|
|
||||||
@ -384,13 +380,31 @@ class JavbusCrawler(GenericCrawler):
|
|||||||
if '全部影片' in text:
|
if '全部影片' in text:
|
||||||
match = re.search(r'全部影片\s*(\d+)\s*', text)
|
match = re.search(r'全部影片\s*(\d+)\s*', text)
|
||||||
if match:
|
if match:
|
||||||
result['meta']['movies_cnt'] = int(match.group(1))
|
title_info['movies_cnt'] = int(match.group(1))
|
||||||
|
|
||||||
# 提取已有磁力数量
|
# 提取已有磁力数量
|
||||||
if '已有磁力' in text:
|
if '已有磁力' in text:
|
||||||
match = re.search(r'已有磁力\s*(\d+)\s*', text)
|
match = re.search(r'已有磁力\s*(\d+)\s*', text)
|
||||||
if match:
|
if match:
|
||||||
result['meta']['magnet_cnt'] = int(match.group(1))
|
title_info['magnet_cnt'] = int(match.group(1))
|
||||||
|
except Exception as e:
|
||||||
|
logging.warning(f"parse html error: {str(e)}, href: {href}", exc_info=True)
|
||||||
|
|
||||||
|
return title_info
|
||||||
|
|
||||||
|
# 获取演员详情
|
||||||
|
def parse_studios_labels_series_detail(self, soup, href):
|
||||||
|
"""
|
||||||
|
解析Javbus网页内容,提取演员信息和影片列表
|
||||||
|
"""
|
||||||
|
result = {
|
||||||
|
'meta': {},
|
||||||
|
'movies': []
|
||||||
|
}
|
||||||
|
|
||||||
|
try:
|
||||||
|
# 解析标题
|
||||||
|
result['meta'] = self.parse_title_info(soup, href)
|
||||||
|
|
||||||
div_waterfall = soup.find('div', id='waterfall')
|
div_waterfall = soup.find('div', id='waterfall')
|
||||||
if not div_waterfall:
|
if not div_waterfall:
|
||||||
|
|||||||
@ -281,7 +281,7 @@ class JavbusDBHandler(DatabaseHandler):
|
|||||||
|
|
||||||
def query_actors(self, **filters):
|
def query_actors(self, **filters):
|
||||||
try:
|
try:
|
||||||
sql = f"SELECT href, en_name as name, uncensored FROM {self.tbl_name_actors} WHERE 1=1"
|
sql = f"SELECT href, en_name as name, uncensored, movies_cnt FROM {self.tbl_name_actors} WHERE 1=1"
|
||||||
params = []
|
params = []
|
||||||
|
|
||||||
conditions = {
|
conditions = {
|
||||||
@ -319,7 +319,7 @@ class JavbusDBHandler(DatabaseHandler):
|
|||||||
params.append(filters["limit"])
|
params.append(filters["limit"])
|
||||||
|
|
||||||
self.cursor.execute(sql, params)
|
self.cursor.execute(sql, params)
|
||||||
return [{'href': row[0], 'name': row[1], 'uncensored': row[2]} for row in self.cursor.fetchall()]
|
return [{'href': row[0], 'name': row[1], 'uncensored': row[2], 'movies_cnt':row[3]} for row in self.cursor.fetchall()]
|
||||||
except sqlite3.Error as e:
|
except sqlite3.Error as e:
|
||||||
logging.error(f"查询 href 失败: {e}")
|
logging.error(f"查询 href 失败: {e}")
|
||||||
return None
|
return None
|
||||||
@ -694,3 +694,55 @@ class JavbusDBHandler(DatabaseHandler):
|
|||||||
self.conn.rollback()
|
self.conn.rollback()
|
||||||
logging.error("Error inserting movie: %s", e)
|
logging.error("Error inserting movie: %s", e)
|
||||||
logging.error(f"query error: {e}")
|
logging.error(f"query error: {e}")
|
||||||
|
|
||||||
|
# Recompute the actors table's movies_cnt column from the actor-movie link table.
def reset_actor_movies(self, check_and_do=0):
    """Refresh ``movies_cnt`` in ``self.tbl_name_actors``.

    Steps:
      1. Add the ``movies_cnt`` column to the actors table if it is missing.
      2. Make sure ``self.tbl_name_actor_movie(actor_id)`` is indexed.
      3. Aggregate the per-actor movie counts into a temporary table.
      4. Copy the counts into the actors table (0 for actors that have no
         linked movie) and commit.

    Reads ``self.cursor``, ``self.conn``, ``self.tbl_name_actors`` and
    ``self.tbl_name_actor_movie``.

    Args:
        check_and_do: Accepted for signature compatibility; not used by this
            method.

    Returns:
        None. SQLite errors are rolled back and logged, not raised.
    """
    try:
        # Check whether the movies_cnt column already exists.
        self.cursor.execute(f"PRAGMA table_info({self.tbl_name_actors});")
        columns = [row[1] for row in self.cursor.fetchall()]

        if 'movies_cnt' not in columns:
            # Column is missing: add it.
            add_field_sql = f"""
                ALTER TABLE {self.tbl_name_actors} ADD COLUMN movies_cnt INTEGER DEFAULT 0 NOT NULL;
            """
            self.cursor.execute(add_field_sql)
            logging.info("成功添加movies_cnt字段")
        else:
            logging.info("movies_cnt字段已存在,跳过添加")

        # Make sure the link table is indexed on the grouping/lookup column.
        self.cursor.execute(f"""
            CREATE INDEX IF NOT EXISTS idx_actor_movie_actor_id
            ON {self.tbl_name_actor_movie}(actor_id);
        """)

        # The scratch table lives for the whole connection. Without this drop
        # a second call (or a call after a failed run) would hit
        # "table temp_actor_counts already exists" and never update anything.
        self.cursor.execute("DROP TABLE IF EXISTS temp.temp_actor_counts;")

        # Aggregate the counts once into a temporary table.
        self.cursor.execute(f"""
            CREATE TEMPORARY TABLE temp_actor_counts AS
            SELECT actor_id, COUNT(movie_id) AS cnt
            FROM {self.tbl_name_actor_movie}
            GROUP BY actor_id;
        """)

        # Index the temporary table for the correlated lookup below.
        self.cursor.execute("CREATE INDEX idx_temp_actor_id ON temp_actor_counts(actor_id);")

        # Update the main table; COALESCE covers actors without movies.
        self.cursor.execute(f"""
            UPDATE {self.tbl_name_actors}
            SET movies_cnt = COALESCE((
                SELECT cnt FROM temp_actor_counts
                WHERE actor_id = {self.tbl_name_actors}.id
            ), 0);
        """)
        updated_rows = self.cursor.rowcount
        logging.info(f"成功更新{updated_rows}个演员的影片数量")

        self.conn.commit()
        logging.info("任务执行完成!")

    except sqlite3.Error as e:
        self.conn.rollback()
        logging.error("Error updating actor movie_cnt: %s", e)

    finally:
        # Always remove the scratch table (and its index) so the next call on
        # this connection starts clean, whether this run succeeded or failed.
        try:
            self.cursor.execute("DROP TABLE IF EXISTS temp.temp_actor_counts;")
        except sqlite3.Error as e:
            logging.warning("Failed to drop temp_actor_counts: %s", e)
|
||||||
|
|||||||
@ -18,6 +18,7 @@ scraper = craw.JavbusCrawler()
|
|||||||
|
|
||||||
debug = False
|
debug = False
|
||||||
skip_local = False
|
skip_local = False
|
||||||
|
g_force = False
|
||||||
g_uncensored = 0
|
g_uncensored = 0
|
||||||
update_mode = 0
|
update_mode = 0
|
||||||
|
|
||||||
@ -313,6 +314,7 @@ def fetch_performers_detail():
|
|||||||
url = performer['href']
|
url = performer['href']
|
||||||
person = performer['name']
|
person = performer['name']
|
||||||
uncensored = int(performer['uncensored'])
|
uncensored = int(performer['uncensored'])
|
||||||
|
movies_cnt = int(performer['movies_cnt'])
|
||||||
avatar = None
|
avatar = None
|
||||||
if not utils.is_valid_url(url):
|
if not utils.is_valid_url(url):
|
||||||
actor_id = db_tools.update_actor_detail_404({'href': url, 'is_full_data': craw.http_code_404})
|
actor_id = db_tools.update_actor_detail_404({'href': url, 'is_full_data': craw.http_code_404})
|
||||||
@ -328,8 +330,18 @@ def fetch_performers_detail():
|
|||||||
if soup:
|
if soup:
|
||||||
data, next_url = scraper.parse_actor_detail(soup, next_url)
|
data, next_url = scraper.parse_actor_detail(soup, next_url)
|
||||||
if data:
|
if data:
|
||||||
|
# 首页,获取影片总数量,如果与数据库中相同,则跳过
|
||||||
if not avatar:
|
if not avatar:
|
||||||
avatar = data.get('avatar')
|
avatar = data.get('avatar')
|
||||||
|
page_title = data.get('title', {})
|
||||||
|
page_movies_cnt = int(page_title.get('movies_cnt', '0'))
|
||||||
|
if page_movies_cnt <= movies_cnt:
|
||||||
|
# 相当于已经全部获取过了,没有要更新的。
|
||||||
|
need_insert = False if not g_force else True
|
||||||
|
logging.info(f"actor already update. insert flag: {need_insert}. person: ({person}), url: {url}")
|
||||||
|
if not need_insert:
|
||||||
|
break
|
||||||
|
|
||||||
all_movies.extend(data.get('movies', []))
|
all_movies.extend(data.get('movies', []))
|
||||||
else:
|
else:
|
||||||
logging.warning(f"fetch_page error. url: {url}")
|
logging.warning(f"fetch_page error. url: {url}")
|
||||||
@ -455,6 +467,7 @@ def fetch_movies_detail():
|
|||||||
# 重置 movies 表的 uncensored 标志位
|
# 重置 movies 表的 uncensored 标志位
|
||||||
def reset_movies_uncensored():
    """Reset the movies' uncensored flag, then refresh the actors' movie counts.

    ``check_and_do`` is passed as 0 when the module-level ``debug`` flag is
    set and as 1 otherwise. Both steps are delegated to ``db_tools``.
    """
    check_flag = 0 if debug else 1
    db_tools.reset_movies_uncensored(check_and_do=check_flag)
    db_tools.reset_actor_movies()
|
||||||
|
|
||||||
# 建立缩写到函数的映射
|
# 建立缩写到函数的映射
|
||||||
function_map = {
|
function_map = {
|
||||||
@ -523,6 +536,9 @@ def set_env(args):
|
|||||||
if args.update:
|
if args.update:
|
||||||
update_mode = args.update
|
update_mode = args.update
|
||||||
|
|
||||||
|
global g_force
|
||||||
|
g_force = args.force
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
# 命令行参数处理
|
# 命令行参数处理
|
||||||
keys_str = ",".join(function_map.keys())
|
keys_str = ",".join(function_map.keys())
|
||||||
@ -548,6 +564,7 @@ if __name__ == "__main__":
|
|||||||
parser.add_argument('--update', type=int, choices=[0, 1, 2, 3, 4], default=0, help='0-只遍历is_full_data=0(默认), 1-只遍历is_full_data=1, 2-遍历is_full_data<=1, 3-只遍历is_full_data>1(异常数据), 4-遍历所有')
|
parser.add_argument('--update', type=int, choices=[0, 1, 2, 3, 4], default=0, help='0-只遍历is_full_data=0(默认), 1-只遍历is_full_data=1, 2-遍历is_full_data<=1, 3-只遍历is_full_data>1(异常数据), 4-遍历所有')
|
||||||
parser.add_argument('--uncensored', type=int, choices=[0, 1, 2], default=1, help='1-只遍历所有 uncensored 的 makers/series/actors/movies(默认), 0-与前者相反, 2-全量')
|
parser.add_argument('--uncensored', type=int, choices=[0, 1, 2], default=1, help='1-只遍历所有 uncensored 的 makers/series/actors/movies(默认), 0-与前者相反, 2-全量')
|
||||||
parser.add_argument('--skip_local', action='store_true', help='如果本地缓存了页面,则跳过数据库操作')
|
parser.add_argument('--skip_local', action='store_true', help='如果本地缓存了页面,则跳过数据库操作')
|
||||||
|
parser.add_argument('--force', action='store_true', help='Enable force update mode (limit records)')
|
||||||
parser.add_argument('--debug', action='store_true', help='Enable debug mode (limit records)')
|
parser.add_argument('--debug', action='store_true', help='Enable debug mode (limit records)')
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user