modify scripts
This commit is contained in:
@ -161,6 +161,42 @@ def fetch_movies_by_series():
|
||||
if debug:
|
||||
return True
|
||||
|
||||
# Refresh the movie index from every publisher's listing pages.
def fetch_movies_by_publishers():
    """Walk each publisher's paginated listing and index the movies found.

    Publisher rows come from ``db_tools.query_publishers_hrefs``: a single
    row when ``debug`` is set, only rows with ``from_list=1`` when
    ``fast_mode`` is set, every row otherwise. For each publisher the
    listing is followed page by page via the ``next_url`` returned by
    ``scraper.parse_publisher_detail``, and each movie is stored with
    ``db_tools.insert_movie_index(..., from_movie_publishers=1, pub_id=<row id>)``.

    Returns:
        True after the first publisher has been processed when ``debug`` is
        set; None otherwise.
    """
    # The debug query replaces whatever the mode-specific query would have
    # returned, so only one query is issued.
    if debug:
        url_list = db_tools.query_publishers_hrefs(limit=1)
    elif fast_mode:
        url_list = db_tools.query_publishers_hrefs(from_list=1)
    else:
        url_list = db_tools.query_publishers_hrefs()

    # The validator does not depend on the URL, so build it once.
    validator = partial(scraper.generic_validator, tag="div", identifier="modal-card", attr_type="class")
    # Upper bound on consecutive failed fetches of one page; without a bound
    # a persistently failing (non-404) URL would be retried forever.
    max_failures = 3

    # query_publishers_hrefs returns None on a database error.
    for row in url_list or []:
        row_id = row['id']
        # Strip the query string (e.g. a "downloadable" flag), if any.
        next_url = utils.remove_url_query(row['href'])
        failures = 0

        while next_url:
            # Remember the page being fetched: next_url is overwritten by the
            # parser below, and log messages must name the page that failed.
            page_url = next_url
            logging.info(f"Fetching data for publisher url {page_url} ...")
            soup, status_code = scraper.fetch_page(page_url, validator)

            if not soup:
                if status_code == 404:
                    logging.warning(f"fetch page error. httpcode: {status_code}, url: {page_url}")
                    break
                failures += 1
                if failures >= max_failures:
                    logging.warning(f"fetch page failed {failures} times, giving up. httpcode: {status_code}, url: {page_url}")
                    break
                continue

            failures = 0
            list_data, next_url = scraper.parse_publisher_detail(soup, page_url)
            if not list_data:
                logging.warning(f"parse_page_movie error. url: {page_url}")
                continue

            for movie in list_data:
                tmp_id = db_tools.insert_movie_index(title=movie['title'], href=movie['href'], from_movie_publishers=1, pub_id=row_id)
                if tmp_id:
                    logging.debug(f"insert one movie index to db. movie_id: {tmp_id}, title: {movie['title']}, href: {movie['href']}")
                else:
                    logging.warning(f"insert movie index failed. title: {movie['title']}, href: {movie['href']}")

        # Debug runs stop after the first publisher.
        # NOTE(review): the original indentation was lost; this assumes the
        # debug exit sits at for-loop level (after the page loop) - confirm.
        if debug:
            return True
|
||||
|
||||
|
||||
# 更新演员信息
|
||||
def fetch_performers_detail():
|
||||
@ -316,6 +352,7 @@ function_map = {
|
||||
"series_list": fetch_series_list,
|
||||
"makers": fetch_movies_by_maker,
|
||||
"series" : fetch_movies_by_series,
|
||||
"pub" : fetch_movies_by_publishers,
|
||||
"movies" : fetch_movies_detail,
|
||||
"actors" : fetch_performers_detail,
|
||||
}
|
||||
|
||||
@ -355,6 +355,7 @@ def parse_movie_detail(soup, href, title):
|
||||
# 获取maker,系列
|
||||
result['maker_name'], result['maker_link'] = parse_movie_val_href(soup, ['片商:', 'Maker:'])
|
||||
result['series_name'], result['series_link'] = parse_movie_val_href(soup, ['系列:', 'Series:'])
|
||||
result['pub_name'], result['pub_link'] = parse_movie_val_href(soup, ['發行:', 'Publisher:'])
|
||||
|
||||
# 获取演员,tags
|
||||
result['tags'] = parse_movie_arr(soup, ['類別:', 'Tags:'])
|
||||
@ -523,10 +524,49 @@ def parse_maker_detail(soup, href):
|
||||
|
||||
return list_data, next_url
|
||||
|
||||
# Parse a publisher listing page and extract its movies and next-page URL.
def parse_publisher_detail(soup, href):
    """Extract the movie entries and the next-page URL from a publisher page.

    Args:
        soup: Parsed page; must support ``find`` / ``find_all`` the way a
            BeautifulSoup document does.
        href: URL of the page being parsed, used to read the current page
            number through ``url_page_num``.

    Returns:
        A ``(list_data, next_url)`` tuple. ``list_data`` is a list of dicts
        with the keys ``href``, ``serial_number``, ``title`` and
        ``release_date``. ``next_url`` is the absolute URL of the next page,
        or None when there is no further page. ``([], None)`` is returned
        when the movie container is not found.
    """
    def text_of(node):
        # A missing element yields '' instead of raising AttributeError, so
        # one malformed item does not abort the whole page.
        return node.text.strip() if node is not None else ''

    div_movies = soup.find("div", class_=re.compile(r'movie-list h cols-4 vcols-(5|8)'))
    if not div_movies:
        logging.warning("Warning: No movies div found ")
        return [], None

    list_data = []
    for row in div_movies.find_all('div', class_='item'):
        anchor = row.find('a', class_='box')
        link = anchor.get('href') if anchor is not None else None
        list_data.append({
            'href': host_url + link if link else '',
            'serial_number': text_of(row.find('strong')),
            'title': text_of(row.find('div', class_='video-title')),
            'release_date': text_of(row.find('div', class_='meta')),
        })

    # Follow the "next page" link only when it really points forward;
    # otherwise the caller would keep re-fetching the same page.
    next_url = None
    next_page_element = soup.find('a', class_='pagination-next')
    next_page_url = next_page_element.get('href') if next_page_element is not None else None
    if next_page_url:
        next_page_number = url_page_num(next_page_url)
        current_page_number = url_page_num(href)
        if current_page_number is None:
            # A URL without a page number is treated as page 0.
            current_page_number = 0
        if next_page_number and next_page_number > current_page_number:
            next_url = host_url + next_page_url

    return list_data, next_url
|
||||
|
||||
|
||||
# 解析 HTML 内容,提取需要的数据
|
||||
def parse_uncensored(soup, href):
|
||||
div_movies = soup.find("div", class_='movie-list h cols-4 vcols-8')
|
||||
#div_movies = soup.find("div", class_='movie-list h cols-4 vcols-8')
|
||||
div_movies = soup.find("div", class_=re.compile(r'movie-list h cols-4 vcols-(5|8)'))
|
||||
if not div_movies:
|
||||
logging.warning(f"Warning: No movies div found ")
|
||||
return [], None
|
||||
|
||||
@ -64,14 +64,14 @@ def insert_actor_index(name, href, from_actor_list=None, from_movie_list=None):
|
||||
logging.error(f"未知错误: {e}")
|
||||
return None
|
||||
|
||||
def insert_movie_index(title, href, from_actor_list=None, from_movie_makers=None, from_movie_series=None, maker_id=None, series_id=None):
|
||||
def insert_movie_index(title, href, from_actor_list=None, from_movie_makers=None, from_movie_series=None, maker_id=None, series_id=None, from_movie_publishers=None, pub_id=None):
|
||||
try:
|
||||
# **先检查数据库中是否已有该电影**
|
||||
cursor.execute("SELECT id, from_actor_list, from_movie_makers, from_movie_series, maker_id, series_id FROM javdb_movies WHERE href = ?", (href,))
|
||||
cursor.execute("SELECT id, from_actor_list, from_movie_makers, from_movie_series, maker_id, series_id, from_movie_publishers, pub_id FROM javdb_movies WHERE href = ?", (href,))
|
||||
existing_movie = cursor.fetchone()
|
||||
|
||||
if existing_movie: # **如果电影已存在**
|
||||
movie_id, existing_actor, existing_maker, existing_series, existing_maker_id, existing_series_id = existing_movie
|
||||
movie_id, existing_actor, existing_maker, existing_series, existing_maker_id, existing_series_id, existing_pub, existing_pub_id = existing_movie
|
||||
|
||||
# **如果没有传入值,就用原来的值**
|
||||
from_actor_list = from_actor_list if from_actor_list is not None else existing_actor
|
||||
@ -79,6 +79,8 @@ def insert_movie_index(title, href, from_actor_list=None, from_movie_makers=None
|
||||
from_movie_series = from_movie_series if from_movie_series is not None else existing_series
|
||||
maker_id = maker_id if maker_id is not None else existing_maker_id
|
||||
series_id = series_id if series_id is not None else existing_series_id
|
||||
from_movie_publishers = from_movie_publishers if from_movie_publishers is not None else existing_pub
|
||||
pub_id = pub_id if pub_id is not None else existing_pub_id
|
||||
|
||||
cursor.execute("""
|
||||
UPDATE javdb_movies
|
||||
@ -88,14 +90,16 @@ def insert_movie_index(title, href, from_actor_list=None, from_movie_makers=None
|
||||
from_movie_series = ?,
|
||||
maker_id = ?,
|
||||
series_id = ?,
|
||||
from_movie_publishers = ?,
|
||||
pub_id = ?,
|
||||
updated_at = datetime('now', 'localtime')
|
||||
WHERE href = ?
|
||||
""", (title, from_actor_list, from_movie_makers, from_movie_series, maker_id, series_id, href))
|
||||
""", (title, from_actor_list, from_movie_makers, from_movie_series, maker_id, series_id, from_movie_publishers, pub_id, href))
|
||||
else: # **如果电影不存在,插入**
|
||||
cursor.execute("""
|
||||
INSERT INTO javdb_movies (title, href, from_actor_list, from_movie_makers, from_movie_series, maker_id, series_id)
|
||||
VALUES (?, ?, COALESCE(?, 0), COALESCE(?, 0), COALESCE(?, 0), COALESCE(?, 0), COALESCE(?, 0))
|
||||
""", (title, href, from_actor_list, from_movie_makers, from_movie_series, maker_id, series_id))
|
||||
INSERT INTO javdb_movies (title, href, from_actor_list, from_movie_makers, from_movie_series, maker_id, series_id, from_movie_publishers, pub_id)
|
||||
VALUES (?, ?, COALESCE(?, 0), COALESCE(?, 0), COALESCE(?, 0), COALESCE(?, 0), COALESCE(?, 0), COALESCE(?, 0), COALESCE(?, 0))
|
||||
""", (title, href, from_actor_list, from_movie_makers, from_movie_series, maker_id, series_id, from_movie_publishers, pub_id))
|
||||
|
||||
conn.commit()
|
||||
|
||||
@ -165,6 +169,7 @@ def insert_or_update_actor(actor):
|
||||
movie_id = get_id_by_href('javdb_movies', movie['href'])
|
||||
# 影片不存在,先插入
|
||||
if movie_id is None:
|
||||
# TODO: from_actor_list 只标记无码女优的话,这里要修改,暂时不动
|
||||
movie_id = insert_movie_index(movie['title'], movie['href'], from_actor_list=1)
|
||||
if movie_id:
|
||||
tmp_id = insert_actor_movie(actor_id, movie_id)
|
||||
@ -470,6 +475,107 @@ def query_series_hrefs(**filters):
|
||||
logging.error(f"查询 href 失败: {e}")
|
||||
return None
|
||||
|
||||
# Insert or update a publisher.
def insert_or_update_publishers(data, caller='list'):
    """Upsert a publisher row keyed by ``href`` and return its id.

    Args:
        data: Mapping with the keys ``name`` and ``href``.
        caller: ``'list'`` sets the ``from_list`` flag, ``'movie'`` sets the
            ``from_movie_list`` flag. Any other value is rejected.

    Returns:
        The ``id`` of the publisher row, or None when ``caller`` is not
        recognised, the row cannot be read back, or a database error occurs
        (in which case the transaction is rolled back).
    """
    # The flag column is picked from this fixed whitelist and never from
    # caller-supplied text, so interpolating it into the SQL below is safe.
    flag_column = {'list': 'from_list', 'movie': 'from_movie_list'}.get(caller)
    if flag_column is None:
        logging.warning(f"unexpected caller: {caller}")
        return None

    try:
        cursor.execute(f"""
            INSERT INTO javdb_publishers (name, href, {flag_column}, updated_at)
            VALUES (?, ?, 1, datetime('now', 'localtime'))
            ON CONFLICT(href) DO UPDATE SET
                name = excluded.name,
                {flag_column} = 1,
                updated_at = datetime('now', 'localtime')
        """, (data["name"], data["href"]))
        conn.commit()

        # Read back the publisher id for the row just written.
        cursor.execute("SELECT id FROM javdb_publishers WHERE href = ?", (data["href"],))
        row = cursor.fetchone()
        # fetchone() returns None when no row matches; indexing it blindly
        # would raise a TypeError that the handler below does not catch.
        if row is None or not row[0]:
            logging.warning(f"publisher row not found after upsert. href: {data['href']}")
            return None

        logging.debug(f"成功插入/更新发行商: {data['name']}")
        return row[0]
    except sqlite3.Error as e:
        conn.rollback()
        logging.error(f"数据库错误: {e}")
        return None
|
||||
|
||||
# Delete a publisher by id or by exact name.
def delete_publishers(identifier):
    """Delete publisher rows matching ``identifier``.

    Args:
        identifier: An ``int`` is matched against the ``id`` column, a
            ``str`` against the ``name`` column (exact match). Any other
            type is rejected with a warning and nothing is executed.

    Returns:
        None. Database errors are logged and the transaction is rolled back.
    """
    if isinstance(identifier, int):
        sql = "DELETE FROM javdb_publishers WHERE id = ?"
    elif isinstance(identifier, str):
        sql = "DELETE FROM javdb_publishers WHERE name = ?"
    else:
        # Previously this case fell through to commit and a "success" log
        # entry even though no statement had been executed.
        logging.warning(f"unsupported publisher identifier: {identifier!r}")
        return

    try:
        cursor.execute(sql, (identifier,))
        conn.commit()
        logging.info(f"成功删除发行商: {identifier}")
    except sqlite3.Error as e:
        conn.rollback()
        logging.error(f"删除失败: {e}")
|
||||
|
||||
# Look up one publisher by id or by partial name.
def query_publishers(identifier):
    """Fetch a single publisher row as a dict.

    Args:
        identifier: An ``int`` selects by ``id``; any other value is
            formatted into a ``LIKE '%...%'`` pattern matched against
            ``name``.

    Returns:
        A dict mapping column names to values for the first matching row,
        or None when nothing matches or a database error occurs.
    """
    if isinstance(identifier, int):
        statement = "SELECT * FROM javdb_publishers WHERE id = ?"
        arguments = (identifier,)
    else:
        statement = "SELECT * FROM javdb_publishers WHERE name LIKE ?"
        arguments = (f"%{identifier}%",)

    try:
        cursor.execute(statement, arguments)
        record = cursor.fetchone()
        if not record:
            logging.warning(f"未找到发行商: {identifier}")
            return None
        # Pair each column name from the cursor description with its value.
        column_names = [column[0] for column in cursor.description]
        return dict(zip(column_names, record))
    except sqlite3.Error as e:
        logging.error(f"查询失败: {e}")
        return None
|
||||
|
||||
# Query publisher hrefs (with ids) by optional filters.
def query_publishers_hrefs(**filters):
    """Return publisher ``href`` / ``id`` pairs matching the given filters.

    Supported keyword filters (all optional, combined with AND):
        id: exact match on ``id``.
        from_list: exact match on ``from_list``.
        url: exact match on ``href``.
        name: substring match on ``name`` (``LIKE '%name%'``).
        limit: maximum number of rows returned.

    Returns:
        A list of ``{'href': ..., 'id': ...}`` dicts, or None when a
        database error occurs.
    """
    # (filter keyword, SQL fragment) pairs for the exact-match filters.
    exact_filters = (
        ("id", " AND id = ?"),
        ("from_list", " AND from_list = ?"),
        # The keyword is "url" but the column is href. The value must be read
        # with the same key that was tested; the original read
        # filters["href"] here and raised KeyError.
        ("url", " AND href = ?"),
    )

    try:
        sql = "SELECT href, id FROM javdb_publishers WHERE 1=1"
        params = []

        for key, clause in exact_filters:
            if key in filters:
                sql += clause
                params.append(filters[key])
        if "name" in filters:
            sql += " AND name LIKE ?"
            params.append(f"%{filters['name']}%")
        # LIMIT must come last in the statement.
        if "limit" in filters:
            sql += " limit ?"
            params.append(filters["limit"])

        cursor.execute(sql, params)
        return [{'href': row[0], 'id': row[1]} for row in cursor.fetchall()]
    except sqlite3.Error as e:
        logging.error(f"查询 href 失败: {e}")
        return None
|
||||
|
||||
|
||||
# 插入或更新类别 """
|
||||
def insert_or_update_tags(name, href):
|
||||
@ -543,17 +649,20 @@ def insert_or_update_movie(movie):
|
||||
# 获取相关 ID
|
||||
makers_id = get_id_by_href('javdb_makers', movie['maker_link']) if movie['maker_link'] else None
|
||||
series_id = get_id_by_href('javdb_series', movie['series_link']) if movie['series_link'] else None
|
||||
pub_id = get_id_by_href('javdb_publishers', movie['pub_link']) if movie['pub_link'] else None
|
||||
|
||||
# 如果不存在,插入
|
||||
if makers_id is None and movie['maker_link']:
|
||||
makers_id = insert_or_update_makers({'name' : movie.get('maker_name', ''), 'href' : movie.get('maker_link', '')}, caller='movie')
|
||||
if series_id is None and movie['series_link']:
|
||||
series_id = insert_or_update_series({'name' : movie.get('series_name', ''), 'href' : movie.get('series_link', '')}, caller='movie')
|
||||
if pub_id is None and movie['pub_link']:
|
||||
pub_id = insert_or_update_publishers({'name' : movie.get('pub_name', ''), 'href' : movie.get('pub_link', '')}, caller='movie')
|
||||
|
||||
cursor.execute("""
|
||||
INSERT INTO javdb_movies (href, title, cover_url, serial_number, release_date, duration,
|
||||
maker_id, series_id, is_full_data, updated_at)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, 1, datetime('now', 'localtime'))
|
||||
maker_id, series_id, pub_id, is_full_data, updated_at)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, 1, datetime('now', 'localtime'))
|
||||
ON CONFLICT(href) DO UPDATE SET
|
||||
title=excluded.title,
|
||||
cover_url=excluded.cover_url,
|
||||
@ -562,10 +671,11 @@ def insert_or_update_movie(movie):
|
||||
duration=excluded.duration,
|
||||
maker_id=excluded.maker_id,
|
||||
series_id=excluded.series_id,
|
||||
pub_id=excluded.pub_id,
|
||||
is_full_data=1,
|
||||
updated_at=datetime('now', 'localtime')
|
||||
""", (movie['href'], movie['title'], movie['cover_url'], movie['serial_number'],
|
||||
movie['release_date'], movie['duration'], makers_id, series_id))
|
||||
movie['release_date'], movie['duration'], makers_id, series_id, pub_id))
|
||||
|
||||
conn.commit()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user