modify scripts

This commit is contained in:
oscarz
2025-06-24 19:03:44 +08:00
parent c5feab2c22
commit 7e14a5f247
4 changed files with 610 additions and 226 deletions

View File

@ -1,6 +1,7 @@
import logging import logging
import sys import sys
import requests import requests
import re
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from urllib.parse import urljoin from urllib.parse import urljoin
import src.utils.utils as utils import src.utils.utils as utils
@ -8,6 +9,7 @@ import src.utils.utils as utils
http_code_404 = 404 http_code_404 = 404
http_code_redirect = 401 http_code_redirect = 401
http_code_url = 601 http_code_url = 601
http_code_local = 99
# 通用的爬取类,主要实现了底层的网络交互封装 # 通用的爬取类,主要实现了底层的网络交互封装
class GenericCrawler: class GenericCrawler:
@ -166,137 +168,292 @@ class JavbusCrawler(GenericCrawler):
return list_data, next_url return list_data, next_url
# 获取演员详情
def parse_actor_detail(self, soup, href): def parse_actor_detail(self, soup, href):
# 先找一下别名 """
alias_list = [] 解析Javbus网页内容提取演员信息和影片列表
"""
div_meta = soup.find('span', class_='actor-section-name') result = {
if not div_meta: 'avatar': {},
logging.warning(f'warning: no meta data found in page {href}') 'movies': []
return None, None
alias_div = soup.find('div', class_='column section-title')
if alias_div:
meta_list = alias_div.find_all('span', class_='section-meta')
if len(meta_list) > 1:
alias_list = meta_list[0].text.strip().split(", ")
# 头像
pic = ''
avatar = soup.find("div", class_="column actor-avatar")
if avatar:
pic = self.parse_avatar_image(avatar)
# 返回数据
actor = {}
# 使用正则表达式查找 class 包含 'movie-list h cols-4' 的 div 元素
div_movies = soup.find("div", class_=re.compile(r'movie-list h cols-'))
# div_movies = soup.find("div", class_='movie-list h cols-4 vcols-5')
# div_movies = soup.find("div", class_='movie-list h cols-4 vcols-8')
if not div_movies:
logging.warning(f"Warning: No movies div found ")
return None, None
# 解析元素
rows = div_movies.find_all('div', class_='item')
list_data = []
next_url = None
for row in rows:
link = row.find('a', class_='box')['href']
serial_number = row.find('strong').text.strip()
title = row.find('div', class_='video-title').text.strip()
release_date = row.find('div', class_='meta').text.strip()
list_data.append({
'href': host_url + link if link else '',
'serial_number': serial_number,
'title': title,
'release_date': release_date
})
# 查找 "下一页" 按钮
next_page_element = soup.find('a', class_='pagination-next')
if next_page_element:
next_page_url = next_page_element['href']
next_page_number = self.url_page_num(next_page_url)
current_page_number = self.url_page_num(href)
logging.debug(f'current_page: {current_page_number}, next page_num: {next_page_number}')
if current_page_number is None:
current_page_number = 0
if next_page_number and next_page_number > current_page_number:
next_url = host_url + next_page_url
actor = {
'pic': pic,
'alias': alias_list,
'movies': list_data
} }
return actor, next_url try:
# 解析演员信息
avatar_box = soup.find('div', class_='avatar-box')
if avatar_box:
result['avatar'] = self.parse_avatar_info(avatar_box)
else:
logging.debug(f"avatar-box not found. href: {href}")
def parse_movie_one(self, soup, keys): # 解析影片列表
key_strong = soup.find('strong', string=lambda text: text in keys) movie_boxes = soup.find_all('a', class_='movie-box')
if key_strong: if movie_boxes:
key_span = key_strong.find_next_sibling('span', class_='value') for movie_box in movie_boxes:
if key_span: movie_info = self.parse_movie_info(movie_box)
return key_span.text.strip() if movie_info:
result['movies'].append(movie_info)
else:
logging.debug(f"movie-box not found. href: {href}")
except Exception as e:
logging.warning(f"parse html error: {str(e)}, href: {href}", exc_info=True)
# 查找 "下一页" 按钮
next_url = None
div_link = soup.find("div", class_='text-center hidden-xs')
if div_link:
next_page_element = soup.find('a', id='next')
if next_page_element:
next_page_url = next_page_element['href']
next_url = urljoin(href, next_page_url)
return result, next_url
def parse_avatar_info(self, avatar_box):
"""
解析演员信息
"""
avatar_info = {}
# 定义映射关系:包含各种语言的字段名称及其对应的目标键名
field_mapping = {
'birth_date': ['生日', 'D.O.B', '生年月日', 'Birthday'],
'age': ['年齡', 'Age', '年龄'],
'height': ['身高', 'Height', '身長'],
'breast_size': ['罩杯', 'Cup', 'ブラのサイズ'],
'bust': ['胸圍', 'Bust', 'バスト'],
'waist': ['腰圍', 'Waist', 'ウエスト'],
'hip': ['臀圍', 'Hips', 'ヒップ'],
'hobby': ['愛好', 'Hobby', '趣味']
}
# 提取演员名称
name_span = avatar_box.find('span', class_='pb10')
if name_span:
avatar_info['name'] = name_span.get_text(strip=True)
else:
logging.debug("未找到演员名称")
# 提取生日、年龄等信息
p_tags = avatar_box.find_all('p')
for p in p_tags:
text = p.get_text(strip=True)
# 使用正则表达式匹配冒号前后的内容
match = re.search(r'^(.*?)[:](.*)$', text)
if match:
key = match.group(1).strip()
value = match.group(2).strip()
# 查找对应的目标键名
target_key = next((k for k, v in field_mapping.items() if any(x in key for x in v)), None)
if target_key:
# 特殊处理数字类型和单位转换
if target_key in ['age', 'height', 'bust', 'waist', 'hip']:
# 提取数字部分
num_match = re.search(r'(\d+\.?\d*)', value)
if num_match:
try:
avatar_info[target_key] = float(num_match.group(1))
# 保留整数(如果是整数)
if avatar_info[target_key].is_integer():
avatar_info[target_key] = int(avatar_info[target_key])
except ValueError:
logging.debug(f"转换数字失败: {value}")
avatar_info[target_key] = value
else:
logging.debug(f"未找到数字部分: {value}")
avatar_info[target_key] = value
else:
avatar_info[target_key] = value
else:
logging.debug(f"未知的演员信息类型: {key}")
else:
logging.debug(f"无法解析的演员信息: {text}")
avatar_info['measurements'] = f"{avatar_info.get('bust', '')}-{avatar_info.get('waist', '')}-{avatar_info.get('hip', '') }"
return avatar_info
def parse_movie_info(self, movie_box):
"""
解析影片信息
"""
movie_info = {}
try:
# 提取影片链接
href = movie_box.get('href')
if href:
movie_info['href'] = href
else:
logging.warning("未找到影片链接")
return None return None
def parse_movie_val_href(self, soup, keys): # 提取图片链接
key_strong = soup.find('strong', string=lambda text: text in keys) img_tag = movie_box.find('img')
if key_strong: if img_tag and 'src' in img_tag.attrs:
key_span = key_strong.find_next_sibling('span', class_='value') movie_info['cover_url'] = img_tag['src']
if key_span: movie_info['title'] = img_tag['title']
a_tag = key_span.find('a')
if a_tag:
return a_tag.text.strip(), host_url + a_tag.get('href')
else: else:
return key_span.text.strip(), None logging.warning("未找到影片图片链接")
return None, None
def parse_movie_arr(self, soup, keys): # 提取标题、番号和发布日期
key_strong = soup.find('strong', string=lambda text: text in keys) photo_info = movie_box.find('div', class_='photo-info')
if key_strong: if photo_info:
key_span = key_strong.find_next_sibling('span', class_='value') # 提取标题 (span标签中的文本排除date标签)
if key_span: span_tag = photo_info.find('span')
actors = [] if span_tag:
a_tags = key_span.find_all('a') # 获取span下的纯文本内容 (不包含date标签)
for a_tag in a_tags: title = ''.join(span_tag.find_all(text=True, recursive=False)).strip()
actors.append({ # 移除常见的分隔符模式
'name': a_tag.text.strip(), if title.endswith('\n\n /'):
'href': host_url + a_tag.get('href') clean_title = title[:-4].strip()
}) elif title.endswith('\n /'):
return actors clean_title = title[:-3].strip()
return [] else:
clean_title = title
movie_info['title'] = clean_title
# 提取番号和日期 (date标签)
date_tags = span_tag.find_all('date')
if len(date_tags) >= 2:
movie_info['serial_number'] = date_tags[0].get_text(strip=True)
movie_info['release_date'] = date_tags[1].get_text(strip=True)
else:
logging.warning(f"date标签数量不足无法提取番号和日期")
else:
logging.warning("未找到span标签")
else:
logging.warning("未找到影片信息区域")
except Exception as e:
logging.error(f"解析影片信息时发生错误: {str(e)}", exc_info=True)
return None
return movie_info
# 解析Javbus影片详情页内容
def parse_movie_detail(self, soup, href, title): def parse_movie_detail(self, soup, href, title):
div_video = soup.find("div", class_='video-meta-panel') result = {
if not div_video: 'title': title,
logging.warning(f"Warning: No movies div found ") 'href': href,
return None, None 'serial_number': '',
'release_date': '',
'duration': '',
'studio': {'name': '', 'href': ''},
'label': {'name': '', 'href': ''},
'series': {'name': '', 'href': ''},
'tags': [],
'actors': []
}
result = {} try:
result['href'] = href # 提取标题
result['title'] = title div_container = soup.find('div', class_='container')
if not div_container:
logging.warning(f"found no container tag.")
return None
# 获取封面图片 title_element = div_container.find('h3')
cover_img = soup.select_one('.column-video-cover a') if title_element:
result['cover_url'] = cover_img['href'] if cover_img else None result['title'] = title_element.get_text(strip=True)
else:
logging.debug("未找到影片标题")
# 获取番号 # 提取基本信息(识别码、发行日期等)
result['serial_number'] = self.parse_movie_one(soup, ['番號:', 'ID:']) info_div = div_container.find('div', class_='info')
result['release_date'] = self.parse_movie_one(soup, ['日期:', 'Released Date:']) if not info_div:
result['duration'] = self.parse_movie_one(soup, ['時長:', 'Duration:']) logging.warning(f"found no div info tag.")
return None
# 获取maker系列 # 定义字段映射关系(多种语言支持)
result['maker_name'], result['maker_link'] = self.parse_movie_val_href(soup, ['片商:', 'Maker:']) field_mapping = {
result['series_name'], result['series_link'] = self.parse_movie_val_href(soup, ['系列:', 'Series:']) 'serial_number': ['識別碼:', '识别码:', 'ID:', '品番:'],
result['pub_name'], result['pub_link'] = self.parse_movie_val_href(soup, ['發行:', 'Publisher:']) 'release_date': ['發行日期:', '发行日期:', 'Release Date:', '発売日:'],
'duration': ['長度:', '长度:', 'Length:', '収録時間:'],
'studio': ['製作商:', '制作商:', 'Studio:', 'メーカー:'],
'label': ['發行商:', '发行商:', 'Label:', 'レーベル:'],
'series': ['系列:', 'Series:', 'シリーズ:']
}
# 获取演员tags # 遍历所有p标签查找信息
result['tags'] = self.parse_movie_arr(soup, ['類別:', 'Tags:']) p_tags = info_div.find_all('p')
result['actors'] = self.parse_movie_arr(soup, ['演員:', 'Actor(s):']) for p in p_tags:
# 查找header标签
header = p.find('span', class_='header')
if header:
header_text = header.get_text(strip=True)
# 查找匹配的目标键名
target_key = next((k for k, v in field_mapping.items() if header_text in v), None)
if target_key:
# 获取值(处理文本和链接)
if target_key in ['studio', 'label', 'series']:
# 处理有链接的字段
a_tag = p.find('a')
if a_tag:
result[target_key]['name'] = a_tag.get_text(strip=True)
result[target_key]['href'] = a_tag.get('href', '')
else:
# 没有链接,直接获取文本
value_text = p.get_text(strip=True)
# 移除header文本
value_text = value_text.replace(header_text, '').strip()
result[target_key]['name'] = value_text
logging.debug(f"{header_text} 没有链接,直接提取文本")
else:
# 处理普通文本字段
value_text = p.get_text(strip=True)
# 移除header文本
value_text = value_text.replace(header_text, '').strip()
# 特殊处理:提取时长的数字部分(咱不处理)
if target_key == 'duration' and False:
num_match = re.search(r'(\d+)', value_text)
if num_match:
result[target_key] = num_match.group(1)
else:
result[target_key] = value_text
else:
result[target_key] = value_text
# 处理类别字段
tag_lables = info_div.find_all('label')
for item in tag_lables:
link = item.find('a')
if link:
genre = {
'name': link.get_text(strip=True),
'href': link.get('href', '')
}
result['tags'].append(genre)
# 提取演员信息
star_p = info_div.find('p', class_='star-show')
if star_p:
# 查找演员列表
star_list = star_p.find_next('ul')
if star_list:
star_items = star_list.find_all('div', class_='star-name')
for item in star_items:
link = item.find('a')
if link:
actor = {
'name': link.get_text(strip=True),
'href': link.get('href', '')
}
result['actors'].append(actor)
else:
logging.debug(f"actors not found.")
else:
logging.warning("未找到演员列表区域")
else:
logging.warning("未找到演员标题")
except Exception as e:
logging.error(f"解析影片详情时发生错误: {str(e)}", exc_info=True)
return result return result
@ -527,67 +684,3 @@ class JavbusCrawler(GenericCrawler):
next_url = host_url + next_page_url next_url = host_url + next_page_url
return list_data, next_url return list_data, next_url
@staticmethod
def pretty_print_json(data, n=10, indent=4, sort_keys=False):
"""
以美化格式打印数组的前n个元素其他元素用"..."表示
参数:
- data: 要打印的数据(应为数组)
- n: 要显示的元素数量
- indent: 缩进空格数
- sort_keys: 是否按键排序
"""
try:
# 处理非数组数据
if not isinstance(data, list):
print(formatted)
return
# 复制原始数据,避免修改原数组
data_copy = data.copy()
# 切片取前n个元素
first_n_elements = data_copy[:n]
# 如果数组长度超过n添加"..."标记
if len(data) > n:
result = first_n_elements + ["... ({} more elements)".format(len(data) - n)]
else:
result = first_n_elements
# 格式化输出
formatted = json.dumps(result, indent=indent, ensure_ascii=False, sort_keys=sort_keys)
print(formatted)
except TypeError as e:
print(f"错误:无法格式化数据。详情:{e}")
except Exception as e:
print(f"打印时发生意外错误:{e}")
def test_actor_list(self, url='https://www.javbus.com/uncensored/actresses/1'):
next_url = url
all_data = []
while next_url:
print(f'fetching page {next_url}')
soup, status_code = self.fetch_page(next_url, partial(self.generic_validator, tag="div", identifier="waterfall", attr_type="id"),
max_retries=1)
if soup:
list_data, next_url = self.parse_actors_list(soup, next_url)
if list_data:
all_data.extend(list_data)
self.pretty_print_json(all_data)
else:
print('get wrong page.')
if next_url:
print(f"\n\nnext url: {next_url}")
else:
print(f"wrong request. url: {next_url}, status_code: {status_code}")
break
def url_page_num(self, url):
# 这里需要根据实际情况实现提取页码的逻辑
return None

View File

@ -118,6 +118,11 @@ class DatabaseHandler:
logging.error(f"Error inserting or updating data: {e}") logging.error(f"Error inserting or updating data: {e}")
return None return None
def get_id_by_key(self, tbl, uniq_key, val):
    """
    Look up the ``id`` of the row in ``tbl`` whose ``uniq_key`` column equals ``val``.

    The table and column names are interpolated into the SQL text, so they
    must come from trusted code; only ``val`` is bound as a parameter.

    Returns:
        The id of the first matching row, or None when nothing matches.
    """
    query = f"SELECT id FROM {tbl} WHERE {uniq_key} = ?"
    self.cursor.execute(query, (val,))
    record = self.cursor.fetchone()
    if record:
        return record[0]
    return None
def insert_task_log(self): def insert_task_log(self):
return 1 return 1
@ -137,20 +142,62 @@ class JavbusDBHandler(DatabaseHandler):
def __init__(self, db_path=None): def __init__(self, db_path=None):
super().__init__(db_path) super().__init__(db_path)
self.tbl_name_actors = 'javbus_actors' self.tbl_name_actors = 'javbus_actors'
self.tbl_name_movies = 'javbus_movies'
self.tbl_name_studios = 'javbus_studios'
self.tbl_name_labels = 'javbus_labels'
self.tbl_name_series = 'javbus_series'
self.tbl_name_tags = 'javbus_tags'
self.tbl_name_movie_tags = 'javbus_movies_tags'
self.tbl_name_actor_movie = 'javbus_actors_movies'
def insert_actor_index(self, data, **kwargs):
fields = ['uncensored', 'from_actor_list', 'from_movie_list']
# 如果没有传入值,就用原来的值
for field in fields:
if kwargs.get(field) is not None:
data[field] = kwargs.get(field)
def insert_actor_index(self, data, uncensored=0, from_actor_list=0, from_movie_list=0):
data['uncensored'] = uncensored
if from_actor_list:
data['from_actor_list'] = from_actor_list
if from_movie_list:
data['from_movie_list'] = from_movie_list
try: try:
return self.insert_or_update_common(data, self.tbl_name_actors, uniq_key='href') return self.insert_or_update_common(data, self.tbl_name_actors, uniq_key='href')
except sqlite3.Error as e: except sqlite3.Error as e:
logging.error(f"Error inserting or updating data: {e}") logging.error(f"Error inserting or updating data: {e}")
return None return None
def update_actor_detail(self, data, is_full_data=1): def insert_movie_index(self, data, **kwargs):
fields = ['uncensored', 'from_actor_list', 'from_movie_studios', 'from_movie_labels', 'from_movie_series']
# 如果没有传入值,就用原来的值
for field in fields:
if kwargs.get(field) is not None:
data[field] = kwargs.get(field)
try:
return self.insert_or_update_common(data, self.tbl_name_movies, uniq_key='href')
except sqlite3.Error as e:
logging.error(f"Error inserting or updating data: {e}")
return None
# Insert (or refresh) the link row between an actor and a movie.
def insert_actor_movie(self, performer_id, movie_id, tags=''):
    """
    Upsert one row of the ``javbus_actors_movies`` link table.

    An existing (actor_id, movie_id) pair gets its ``tags`` and
    ``updated_at`` refreshed instead of raising a conflict.

    Returns:
        ``performer_id`` on success; None after rolling back on any error.
    """
    upsert_sql = """
        INSERT INTO javbus_actors_movies (actor_id, movie_id, tags, updated_at)
        VALUES (?, ?, ?, datetime('now', 'localtime'))
        ON CONFLICT(actor_id, movie_id) DO UPDATE SET tags=excluded.tags, updated_at=datetime('now', 'localtime')
        """
    bound_values = (performer_id, movie_id, tags)
    try:
        self.cursor.execute(upsert_sql, bound_values)
        self.conn.commit()
    except Exception as e:
        self.conn.rollback()
        logging.error("Error inserting movie: %s", e)
        return None
    return performer_id
def update_actor_detail_404(self, data, is_full_data=1):
try: try:
data['is_full_data'] = is_full_data data['is_full_data'] = is_full_data
return self.insert_or_update_common(data, self.tbl_name_actors, uniq_key='href') return self.insert_or_update_common(data, self.tbl_name_actors, uniq_key='href')
@ -158,17 +205,49 @@ class JavbusDBHandler(DatabaseHandler):
logging.error(f"Error inserting or updating data: {e}") logging.error(f"Error inserting or updating data: {e}")
return None return None
def update_actor_detail(self, data, is_full_data=1):
    """
    Store one actor's profile and the movies the actor appears in.

    Args:
        data: dict with ``href`` (required) and optionally ``avatar``
            (profile fields written to the actors table), ``credits``
            (list of movie dicts) and ``uncensored``.
        is_full_data: value written to the actor's ``is_full_data`` column
            when ``avatar`` is provided.

    Returns:
        The actor row id, or None when ``href`` is missing, the actor row
        cannot be written/found, or a sqlite3 error occurs.
    """
    # href is the unique key for everything below; read it once and bail out
    # early instead of raising a KeyError that the sqlite3 handler would not catch.
    href = data.get('href')
    if not href:
        logging.warning("update_actor_detail: no href in data, skip.")
        return None

    try:
        # Update the actors table
        avatar = data.get('avatar')
        if avatar is not None:
            avatar['href'] = href
            avatar['is_full_data'] = is_full_data
            avatar_id = self.insert_or_update_common(avatar, self.tbl_name_actors, uniq_key='href')
            logging.debug(f"update actor data. data: {avatar}")
        else:
            avatar_id = self.get_id_by_key(self.tbl_name_actors, 'href', href)

        # Checked for both paths so a failed insert never produces link rows
        # with a missing actor id.
        if not avatar_id:
            logging.warning(f"get actor id error. href: {href}")
            return None

        # Update the movies table
        uncensored = data.get('uncensored', 0)
        for movie in data.get('credits', []):
            movie_id = self.insert_movie_index(movie, from_actor_list=1, uncensored=uncensored)
            if not movie_id:
                continue
            logging.debug(f"insert one movie index. data: {movie}")

            # Insert the actor <-> movie relation
            link_id = self.insert_actor_movie(avatar_id, movie_id)
            if link_id:
                logging.debug(f"insert one actor_movie record. actor id: {avatar_id}, movie id: {movie_id}")

        return avatar_id
    except sqlite3.Error as e:
        logging.error(f"Error inserting or updating data: {e}")
        return None
def query_actors(self, **filters): def query_actors(self, **filters):
try: try:
sql = f"SELECT url, en_name as name FROM {self.tbl_name_actors} WHERE 1=1" sql = f"SELECT href, en_name as name, uncensored FROM {self.tbl_name_actors} WHERE 1=1"
params = [] params = []
conditions = { conditions = {
"id": " AND id = ?", "id": " AND id = ?",
"url": " AND href = ?", "href": " AND href = ?",
"en_name": " AND name LIKE ?", "en_name": " AND en_name LIKE ?",
"is_full_data": " AND is_full_data = ?", "is_full_data": " AND is_full_data = ?",
"start_id": " AND id > ?", "start_id": " AND id > ?",
"uncensored": " AND uncensored = ?",
} }
for key, condition in conditions.items(): for key, condition in conditions.items():
@ -197,8 +276,157 @@ class JavbusDBHandler(DatabaseHandler):
params.append(filters["limit"]) params.append(filters["limit"])
self.cursor.execute(sql, params) self.cursor.execute(sql, params)
return [{'url': row[0], 'name': row[1]} for row in self.cursor.fetchall()] return [{'href': row[0], 'name': row[1], 'uncensored': row[2]} for row in self.cursor.fetchall()]
except sqlite3.Error as e: except sqlite3.Error as e:
logging.error(f"查询 href 失败: {e}") logging.error(f"查询 href 失败: {e}")
return None return None
def query_movies(self, **filters):
    """
    Query rows of the movies table.

    Supported keyword filters: ``id``, ``href``, ``title`` (substring match
    via LIKE), ``is_full_data``, ``start_id`` (id greater than),
    ``uncensored``, ``is_full_data_in`` / ``is_full_data_not_in`` (non-empty
    sequences), ``order_by`` and ``limit``.

    Returns:
        A list of dicts with keys ``href``, ``title``, ``uncensored`` and
        ``id``; None when sqlite3 reports an error.
    """
    # (filter name, SQL fragment), applied in this fixed order.
    simple_clauses = (
        ("id", " AND id = ?"),
        ("href", " AND href = ?"),
        ("title", " AND title LIKE ?"),
        ("is_full_data", " AND is_full_data = ?"),
        ("start_id", " AND id > ?"),
        ("uncensored", " AND uncensored = ?"),
    )
    set_clauses = (
        ("is_full_data_in", "IN"),
        ("is_full_data_not_in", "NOT IN"),
    )

    try:
        sql = f"SELECT href, title, uncensored, id FROM {self.tbl_name_movies} WHERE 1=1"
        params = []

        for name, clause in simple_clauses:
            if name not in filters:
                continue
            sql += clause
            wanted = filters[name]
            params.append(f"%{wanted}%" if name == "title" else wanted)

        for name, operator in set_clauses:
            values = filters[name] if name in filters else None
            if not values:
                continue
            placeholders = ", ".join(["?"] * len(values))
            sql += f" AND is_full_data {operator} ({placeholders})"
            params.extend(values)

        if "order_by" in filters:
            # The column name is spliced in directly: a bound placeholder
            # would be treated as a string literal, not as a column.
            sql += f" ORDER BY {filters['order_by']} "

        if 'limit' in filters:
            sql += " LIMIT ?"
            params.append(filters["limit"])

        self.cursor.execute(sql, params)
        found = []
        for row in self.cursor.fetchall():
            found.append({'href': row[0], 'title': row[1], 'uncensored': row[2], 'id': row[3]})
        return found
    except sqlite3.Error as e:
        logging.error(f"查询 href 失败: {e}")
        return None
# Look a record up by its unique key and create it when it does not exist yet.
def check_and_get_id(self, item, uncensored, tbl, uniq_key='href'):
    """
    Return the id of the row in ``tbl`` that matches ``item['href']``.

    When no such row exists, a new one is written with the item's name and
    href, the given ``uncensored`` flag and ``from_movie_list`` set to 1.

    Returns:
        The existing id, or whatever ``insert_or_update_common`` returns for
        the newly written row.
    """
    name, href = item['name'], item['href']

    existing_id = self.get_id_by_key(tbl, uniq_key, href)
    if existing_id is not None:
        return existing_id

    new_row = {'name': name, 'href': href, 'uncensored': uncensored, 'from_movie_list': 1}
    return self.insert_or_update_common(new_row, tbl_name=tbl, uniq_key=uniq_key)
def insert_or_update_tags(self, data, uniq_key='href'):
    """
    Insert or update one row of the tags table.

    Thin wrapper that forwards ``data`` and ``uniq_key`` to
    ``insert_or_update_common`` with the tags table name, and returns its result.
    """
    tags_table = self.tbl_name_tags
    return self.insert_or_update_common(data, tags_table, uniq_key)
def insert_movie_tags(self, movie_id, tag_id, tags):
    """
    Upsert one row of the ``javbus_movies_tags`` link table.

    An existing (tag_id, movie_id) pair gets its ``tags`` text and
    ``updated_at`` refreshed instead of raising a conflict.

    Returns:
        ``movie_id`` on success; None after rolling back on any error.
    """
    upsert_sql = """
        INSERT INTO javbus_movies_tags (movie_id, tag_id, tags, updated_at)
        VALUES (?, ?, ?, datetime('now', 'localtime'))
        ON CONFLICT(tag_id, movie_id) DO UPDATE SET tags=excluded.tags, updated_at=datetime('now', 'localtime')
        """
    bound_values = (movie_id, tag_id, tags)
    try:
        self.cursor.execute(upsert_sql, bound_values)
        self.conn.commit()
    except Exception as e:
        self.conn.rollback()
        logging.error("Error inserting movie: %s", e)
        return None
    return movie_id
def insert_or_update_movie_404(self, data, is_full_data=None):
    """
    Record a movie whose page could not be fetched normally (404, redirect, ...).

    The status code is stored in the row's ``is_full_data`` column.
    Precedence for that value:
      1. an explicit ``is_full_data`` argument,
      2. ``data['is_full_data']`` when the caller already put it there,
      3. 1 (the previous default).

    The previous version always overwrote ``data['is_full_data']`` with the
    argument, so a caller passing ``{'href': ..., 'is_full_data': 404}``
    without the keyword had its 404 replaced by 1.

    Args:
        data: row dict; must contain ``href`` (used as the unique key).
            It is updated in place.
        is_full_data: status value to store, or None to keep/derive it.

    Returns:
        The result of ``insert_or_update_common``, or None on a sqlite3 error.
    """
    try:
        if is_full_data is not None:
            data['is_full_data'] = is_full_data
        else:
            data.setdefault('is_full_data', 1)
        return self.insert_or_update_common(data, self.tbl_name_movies, uniq_key='href')
    except sqlite3.Error as e:
        logging.error(f"Error inserting or updating data: {e}")
        return None
# """插入或更新电影数据"""
def insert_or_update_movie(self, movie, is_full_data=1):
    """
    Insert or update one movie together with its related rows.

    Steps:
      1. Resolve (creating if needed) the studio / label / series rows and
         store their ids on the movie as ``studio_id`` / ``label_id`` /
         ``series_id``. Entries without an ``href`` are skipped so that
         empty placeholders do not become rows of their own.
      2. Write the movie row (unique key: ``href``).
      3. Link every actor in ``movie['actors']``; unknown actors are first
         inserted through ``insert_actor_index``.
      4. Upsert every tag in ``movie['tags']`` and link it to the movie.

    Args:
        movie: dict with at least ``title`` and ``href``; optional
            ``uncensored``, ``studio``, ``label``, ``series`` (dicts with
            ``name``/``href``), ``actors`` and ``tags`` (lists of such dicts).
            The dict is updated in place.
        is_full_data: value stored in the movie's ``is_full_data`` column.

    Returns:
        The movie id, or None when the movie row could not be written or
        any exception occurred (the transaction is rolled back).
    """
    try:
        uncensored = movie.get('uncensored', 0)

        # (movie field, id column on the movie row, lookup table)
        related_tables = (
            ('studio', 'studio_id', self.tbl_name_studios),
            ('label', 'label_id', self.tbl_name_labels),
            ('series', 'series_id', self.tbl_name_series),
        )
        for field, id_column, table in related_tables:
            item = movie.get(field)
            # Skip missing entries and href-less placeholders: href is the
            # unique key, so an empty one would collapse unrelated entries
            # into a single bogus row.
            if not item or not item.get('href'):
                continue
            related_id = self.check_and_get_id(item, uncensored, table)
            if related_id:
                movie[id_column] = related_id

        movie['is_full_data'] = is_full_data
        movie['actors_cnt'] = len(movie.get('actors', []))

        movie_id = self.insert_or_update_common(movie, self.tbl_name_movies, uniq_key='href')
        if movie_id is None:
            logging.warning(f"insert/update movie error. data:{movie}")
            return None

        logging.debug(f"insert one move, id: {movie_id}, title: {movie['title']}, href: {movie['href']}")

        # Actor <-> movie relation table
        for performer in movie.get('actors', []):
            performer_id = self.get_id_by_key(self.tbl_name_actors, 'href', performer['href'])
            # Unknown actor: insert it first. insert_actor_index expects a
            # single data dict; passing name and href as two positional
            # arguments raised TypeError and aborted the whole movie.
            if performer_id is None:
                performer_id = self.insert_actor_index(
                    {'name': performer['name'], 'href': performer['href']},
                    from_movie_list=1,
                )
                logging.debug(f"insert new perfomer. perfomer_id: {performer_id}, name:{performer['name']}")
            if performer_id:
                tmp_id = self.insert_actor_movie(performer_id, movie_id)
                if tmp_id:
                    logging.debug(f"insert one perfomer_movie. perfomer_id: {performer_id}, movie_id:{movie_id}")
                else:
                    logging.debug(f"insert perfomer_movie failed. perfomer_id: {performer_id}, movie_id:{movie_id}")
            else:
                logging.warning(f"insert perfomer failed. name: {performer['name']}, href: {performer['href']}")

        # Tags table and movie <-> tag relation table
        for tag in movie.get('tags', []):
            tag_name = tag.get('name', '')
            tag_href = tag.get('href', '')
            tag_id = self.insert_or_update_tags({'name': tag_name, 'href': tag_href}, uniq_key='href')
            if tag_id:
                logging.debug(f"insert one tags. tag_id: {tag_id}, name: {tag_name}")
                tmp_id = self.insert_movie_tags(movie_id=movie_id, tag_id=tag_id, tags=tag_name)
                if tmp_id:
                    logging.debug(f"insert one movie_tag. movie_id: {movie_id}, tag_id: {tag_id}, name: {tag_name}")
                else:
                    logging.warning(f"insert one movie_tag error. movie_id: {movie_id}, tag_id: {tag_id}, name: {tag_name}")
            else:
                logging.warning(f"insert tags error. name:{tag_name}, href: {tag_href}")

        return movie_id

    except Exception as e:
        self.conn.rollback()
        logging.error("Error inserting movie: %s", e)
        return None

View File

@ -246,13 +246,13 @@ def fetch_performers_detail():
limit_count = 5 if debug else 100 limit_count = 5 if debug else 100
performers_list = [] performers_list = []
last_performer_id = 0 last_performer_id = 0
abnormal_codes = [scraper.http_code_404, scraper.http_code_login] abnormal_codes = [craw.http_code_404, craw.http_code_redirect]
def get_performers(**kwargs): def get_performers(**kwargs):
if scan_mode == 1: if scan_mode == 1:
kwargs["from_actor_list"] = 1 kwargs["uncensored"] = 1
elif scan_mode == 0: elif scan_mode == 0:
kwargs["from_actor_list"] = 0 kwargs["uncensored"] = 0
else: else:
logging.debug(f"scan all records") logging.debug(f"scan all records")
kwargs["order_by"] = 'id asc' kwargs["order_by"] = 'id asc'
@ -278,29 +278,29 @@ def fetch_performers_detail():
for performer in performers_list: for performer in performers_list:
url = performer['href'] url = performer['href']
person = performer['name'] person = performer['name']
pic = '' uncensored = int(performer['uncensored'])
alias = [] avatar = None
next_url = url next_url = url
all_movies = [] all_movies = []
need_insert = True need_insert = True
while next_url: while next_url:
logging.debug(f"Fetching data for actor ({person}), url {next_url} ...") logging.debug(f"Fetching data for actor ({person}), url {next_url} ...")
soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="span", identifier="actor-section-name", attr_type="class")) soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="alert alert-success alert-common", attr_type="class"))
if soup: if soup:
data, next_url = scraper.parse_actor_detail(soup, next_url) data, next_url = scraper.parse_actor_detail(soup, next_url)
if data: if data:
pic = data.get('pic', '') if not avatar:
alias = data.get('alias', []) avatar = data.get('avatar')
all_movies.extend(data.get('movies', [])) all_movies.extend(data.get('movies', []))
elif status_code and status_code == scraper.http_code_404: elif status_code and status_code == craw.http_code_404:
actor_id = db_tools.insert_or_update_actor_404(name=person, href=url, is_full_data=scraper.http_code_404) actor_id = db_tools.update_actor_detail_404({'href': url, 'is_full_data': craw.http_code_404})
logging.warning(f'404 page. id: {actor_id}, name: ({person}), url: {url}, Skiping...') logging.warning(f'404 page. id: {actor_id}, name: ({person}), url: {url}, Skiping...')
need_insert = False need_insert = False
break break
elif status_code and status_code == scraper.http_code_login: elif status_code and status_code == craw.http_code_redirect:
actor_id = db_tools.insert_or_update_actor_404(name=person, href=url, is_full_data=scraper.http_code_login) actor_id = db_tools.update_actor_detail_404({'href': url, 'is_full_data': craw.http_code_redirect})
logging.warning(f'401 page(need login). id: {actor_id}, name: ({person}), url: {url}, Skiping...') logging.warning(f'401 page(need login). id: {actor_id}, name: ({person}), url: {url}, Skiping...')
need_insert = False need_insert = False
break break
@ -311,16 +311,20 @@ def fetch_performers_detail():
if not need_insert: if not need_insert:
continue continue
#utils.pretty_print_json(avatar)
#utils.pretty_print_json(all_movies)
#continue
# 获取完了个人的所有影片,开始插入数据 # 获取完了个人的所有影片,开始插入数据
performer_id = db_tools.insert_or_update_actor({ performer_id = db_tools.update_actor_detail({
'href': url, 'href': url,
'name': person, 'name': person,
'pic' : pic, 'avatar': avatar,
'alias' : alias, 'credits':all_movies,
'credits':all_movies 'uncensored':uncensored
}) })
if performer_id: if performer_id:
logging.debug(f'insert one person, id: {performer_id}, person: ({person}), url: {url}') logging.debug(f'insert/update one person, id: {performer_id}, person: ({person}), url: {url}')
last_performer_id = performer_id last_performer_id = performer_id
succ_rows += 1 succ_rows += 1
else: else:
@ -334,10 +338,10 @@ def fetch_performers_detail():
# 更新影片信息 # 更新影片信息
def fetch_movies_detail(): def fetch_movies_detail():
limit_count = 10 if debug else 100 limit_count = 2 if debug else 100
movies_list = [] movies_list = []
last_movie_id = 0 last_movie_id = 0
abnormal_codes = [scraper.http_code_404, scraper.http_code_login] abnormal_codes = [craw.http_code_404, craw.http_code_redirect]
def get_movies(**kwargs): def get_movies(**kwargs):
if scan_mode == 1: if scan_mode == 1:
@ -347,7 +351,7 @@ def fetch_movies_detail():
else: else:
logging.debug(f"scan all records.") logging.debug(f"scan all records.")
kwargs["order_by"] = 'id asc' kwargs["order_by"] = 'id asc'
return db_tools.query_movie_hrefs(limit=limit_count, **kwargs) return db_tools.query_movies(limit=limit_count, **kwargs)
while True: while True:
if update_mode == 0: # 只遍历新纪录 if update_mode == 0: # 只遍历新纪录
@ -370,10 +374,11 @@ def fetch_movies_detail():
url = movie['href'] url = movie['href']
title = movie['title'] title = movie['title']
curr_id = movie['id'] curr_id = movie['id']
uncensored = int(movie['uncensored'])
logging.debug(f"Fetching data for movie ({title}), url {url} ...") logging.debug(f"Fetching data for movie ({title}), url {url} ...")
soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="video-meta-panel", attr_type="class")) soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="container", attr_type="class"))
# 从本地读取的文件,忽略 # 从本地读取的文件,忽略
if skip_local and status_code == scraper.http_code_local : if skip_local and status_code == craw.http_code_local :
last_movie_id = curr_id last_movie_id = curr_id
succ_count += 1 succ_count += 1
continue continue
@ -381,6 +386,9 @@ def fetch_movies_detail():
if soup: if soup:
movie_data = scraper.parse_movie_detail(soup, url, title) movie_data = scraper.parse_movie_detail(soup, url, title)
if movie_data : if movie_data :
#utils.pretty_print_json(movie_data)
#continue
movie_data['uncensored'] = uncensored
movie_id = db_tools.insert_or_update_movie(movie_data) movie_id = db_tools.insert_or_update_movie(movie_data)
if movie_id: if movie_id:
logging.debug(f'insert one movie, id: {movie_id}, title: ({title}) url: {url}') logging.debug(f'insert one movie, id: {movie_id}, title: ({title}) url: {url}')
@ -391,11 +399,11 @@ def fetch_movies_detail():
else: else:
logging.warning(f'parse_page_movie error. url: {url}') logging.warning(f'parse_page_movie error. url: {url}')
elif status_code and status_code == scraper.http_code_404: elif status_code and status_code == craw.http_code_404:
movie_id = db_tools.insert_or_update_movie_404(title=title, href=url, is_full_data=scraper.http_code_404) movie_id = db_tools.insert_or_update_movie_404({'href': url, 'is_full_data': craw.http_code_404})
logging.warning(f'404 page. id: {movie_id}, title: ({title}), url: {url}, Skiping...') logging.warning(f'404 page. id: {movie_id}, title: ({title}), url: {url}, Skiping...')
elif status_code and status_code == scraper.http_code_login: elif status_code and status_code == craw.http_code_redirect:
movie_id = db_tools.insert_or_update_movie_404(title=title, href=url, is_full_data=scraper.http_code_login) movie_id = db_tools.insert_or_update_movie_404({'href': url, 'is_full_data': craw.http_code_redirect})
logging.warning(f'401 page(need login). id: {movie_id}, title: ({title}), url: {url}, Skiping...') logging.warning(f'401 page(need login). id: {movie_id}, title: ({title}), url: {url}, Skiping...')
else: else:
logging.warning(f'fetch_page error. url: {url}') logging.warning(f'fetch_page error. url: {url}')

View File

@ -165,3 +165,58 @@ def normalize_url(url: str) -> str:
except Exception as e: except Exception as e:
print(f"URL标准化失败: {url}, 错误: {e}") print(f"URL标准化失败: {url}, 错误: {e}")
return url # 出错时返回原始URL return url # 出错时返回原始URL
import json
def pretty_print_json(data, n=10, indent=4, sort_keys=False):
    """
    Pretty-print ``data`` as JSON, truncating lists to their first ``n`` items.

    A list longer than ``n`` is followed by a "... (k more elements)" marker;
    any non-list value is printed in full. Formatting problems are reported
    on stdout instead of being raised.

    Args:
        data: value to print (normally a list).
        n: number of leading list elements to show.
        indent: indentation width passed to ``json.dumps``.
        sort_keys: whether object keys are sorted.
    """
    try:
        if isinstance(data, list):
            # Slicing builds a new list, so the caller's list is never modified.
            payload = data.copy()[:n]
            if len(data) > n:
                payload = payload + ["... ({} more elements)".format(len(data) - n)]
        else:
            # Anything that is not a list is dumped as-is.
            payload = data

        print(json.dumps(payload, indent=indent, ensure_ascii=False, sort_keys=sort_keys))

    except TypeError as e:
        print(f"错误:无法格式化数据。详情:{e}")
    except Exception as e:
        print(f"打印时发生意外错误:{e}")
# Usage example
if __name__ == "__main__":
    # Long-array example (20 elements)
    sample_rows = [{"id": idx, "value": f"元素{idx}"} for idx in range(1, 21)]

    # Show the first 3, then the first 10 elements; the second heading is
    # preceded by a blank line.
    for lead, shown in (("", 3), ("\n", 10)):
        print(f"{lead}### 输出前{shown}个元素")
        pretty_print_json(sample_rows, n=shown)

    # Non-list data example
    print("\n### 非数组数据(字典):")
    pretty_print_json({"key1": "value1", "key2": "value2"})