modify scripts
@ -8,6 +8,7 @@ export PATH="/home/ubuntu/.local/bin:$PATH"

# Project base paths
SCRAPY_PROJ_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)
GIT_PROJ_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")/../../" && pwd)
LOG_DIR="${SCRAPY_PROJ_DIR}/log"
mkdir -p "${LOG_DIR}"  # make sure the log directory exists (the lock file depends on it)
SLEEP_SECONDS=60
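# Note: the $(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) pattern resolves paths
# relative to this script file, so cron can invoke the script from any directory.
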
@ -62,6 +63,34 @@ release_lock() {
trap release_lock EXIT

# ==============================================
# Git operation: pull the latest code
# ==============================================
# Purpose: run git pull and check the result
# Args:    1. repo directory  2. log function (optional)
# Returns: 0 = success, 1 = failure
git_pull() {
    local repo_dir="$1"
    local log_func="${2:-echo}"  # caller may pass its own log function

    if [ ! -d "${repo_dir}/.git" ]; then
        $log_func "ERROR: ${repo_dir} is not a Git repository; cannot run git pull"
        return 1
    fi

    $log_func "Running git pull to update the code..."
    # declare and assign separately, otherwise `local`'s exit status masks git's
    local pull_output
    pull_output=$(cd "${repo_dir}" && git pull 2>&1)
    local exit_code=$?

    if [ ${exit_code} -eq 0 ]; then
        $log_func "git pull succeeded: ${pull_output}"
        return 0
    else
        $log_func "ERROR: git pull failed (exit code ${exit_code}): ${pull_output}"
        return 1
    fi
}
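
# Usage sketch (the `log` function is defined elsewhere in this script):
#   git_pull "${GIT_PROJ_DIR}" log    # second argument is optional; defaults to echo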

# ==============================================
# Argument parsing: weekly vs. monthly execution
# ==============================================

@ -105,6 +134,7 @@ fi
# Monthly tasks
if [ "${PERIOD}" = "--monthly" ]; then
    register_spider "pbox" "scrapy crawl pbox -a begin=${COMMON_DATE_PARAM} -a mod='update'"
    register_spider "javhd" "scrapy crawl javhd -a mod='update'"
fi

@ -157,6 +187,13 @@ if ! acquire_lock; then
fi
log "Lock acquired; starting tasks"

# Pull the latest code (critical step: abort if it fails)
if ! git_pull "${GIT_PROJ_DIR}" log; then
    log "ERROR: code update failed; aborting remaining steps"
    exit 1
fi

# Step 2: check that tasks were registered
if [ ${#SPIDER_REGISTRY[@]} -eq 0 ]; then
    log "ERROR: no ${PERIOD#--} spiders registered; exiting"
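# Note: ${PERIOD#--} strips the leading "--", so "--monthly" is logged as "monthly".
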
@ -6,11 +6,15 @@
#

SPIDER_NAME_SIS = 'sis'
SPIDER_NAME_U3C3 = 'u3c3'
SPIDER_NAME_CLM = 'clm'
SPIDER_NAME_IAFD = 'iafd'
SPIDER_NAME_PBOX = 'pbox'
SPIDER_NAME_JAVHD = 'javhd'
SPIDER_NAME_JAVDB = 'javdb'
SPIDER_NAME_JAVBUS = 'javbus'
SPIDER_NAME_LORD = 'lord'

ITEM_TYPE_LIST = 'list'
ITEM_TYPE_STUDIO = 'studio'

@ -561,4 +561,51 @@ class PboxDBHandler(SQLiteDBHandler):

    def close_spider(self, spider):
        # close the database connection
        self.conn.close()

@register_handler(comm.SPIDER_NAME_JAVHD)
class JavHDDBHandler(SQLiteDBHandler):
    def __init__(self, db_path=shared_db_path):
        super().__init__(db_path)
        self.tbl_name_javhd = 'javhd_models'

    def insert_item(self, item):
        if item['item_type'] == comm.ITEM_TYPE_ACTOR_INDEX:
            self.insert_or_update_common(item, self.tbl_name_javhd, uniq_key='url', exists_do_nothing=False)
        elif item['item_type'] == comm.ITEM_TYPE_ACTOR_DETAIL:
            self.insert_or_update_common(item, self.tbl_name_javhd, uniq_key='url', exists_do_nothing=False)
        else:
            logging.error(f"unknown item type: {item.get('item_type')}")

        return item

    # stats helper
    def get_stat(self):
        try:
            self.cursor.execute(f"""
                SELECT
                    (SELECT COUNT(*) FROM {self.tbl_name_javhd}) AS cnt
            """)

            row = self.cursor.fetchone()
            if not row:
                logging.warning("query returned no results.")
                return {}

            columns = [desc[0] for desc in self.cursor.description]
            return dict(zip(columns, row))

        except sqlite3.Error as e:
            logging.error(f"query error: {e}")
            return {}

    def has_full_data(self, href):
        try:
            self.cursor.execute(f"SELECT count(*) AS cnt FROM {self.tbl_name_javhd} WHERE is_full_data=1 AND url = ?", (href,))
            row = self.cursor.fetchone()
            # return 0 (not None) when nothing is found, so callers can safely compare with < 1
            return row[0] if row else 0
        except sqlite3.Error as e:
            logging.error(f"query error: {e}")
            return 0
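
# Usage sketch (made-up URL; assumes the javhd_models table already exists):
#   db_tools = JavHDDBHandler()
#   if db_tools.has_full_data('https://javhd.com/en/model/example') < 1:
#       ...  # actor not fully stored yet, fetch the detail page
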
@ -77,7 +77,7 @@ class StatsExtension:
        # current stats snapshot
        stats = self.stats.get_stats()
        # spider-defined custom stats
        spider_stat = {'task': '-------'}
        prefix = f"{self.spider_name}/"
        for key, value in stats.items():
            if key.startswith(prefix):

@ -163,3 +163,33 @@ class ClmKeywordsIndexItem(scrapy.Item):
    index_id = scrapy.Field()
    wid_iid = scrapy.Field()
    tags = scrapy.Field()

class JavHDActorIndexItem(scrapy.Item):
    item_type = scrapy.Field()
    rank = scrapy.Field()
    ja_name = scrapy.Field()
    zh_name = scrapy.Field()
    en_name = scrapy.Field()
    url = scrapy.Field()
    pic = scrapy.Field()
    is_full_data = scrapy.Field()


class JavHDActorItem(scrapy.Item):
    item_type = scrapy.Field()
    rank = scrapy.Field()
    ja_name = scrapy.Field()
    zh_name = scrapy.Field()
    en_name = scrapy.Field()
    url = scrapy.Field()
    pic = scrapy.Field()
    height = scrapy.Field()
    weight = scrapy.Field()
    breast_size = scrapy.Field()
    breast_factor = scrapy.Field()
    hair_color = scrapy.Field()
    eye_color = scrapy.Field()
    birth_date = scrapy.Field()
    ethnicity = scrapy.Field()
    birth_place = scrapy.Field()
    is_full_data = scrapy.Field()
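
# Scrapy items act like dicts restricted to their declared fields, e.g.:
#   item = JavHDActorIndexItem()
#   item['rank'] = 1    # OK: declared above
#   item['foo'] = 'x'   # raises KeyError (undeclared field)
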
@ -22,9 +22,8 @@ class IAFDSpider(BaseSpider):
    def __init__(self, debug='false', cmd='', update='0', *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.debug = str(debug).lower() in ('true', '1')
        self.cmd_str = cmd
        self.update = int(update)
        self.logger.info(f"debug mode: {self.debug}, cmd: {self.cmd_str}, update: {self.update}")
        self.logger.info(f"RUN CMD: {' '.join(sys.argv)}")

        self.cmd_astro = 'astro'
        self.cmd_birth = 'birth'
@ -33,25 +32,28 @@ class IAFDSpider(BaseSpider):
        self.cmd_stu = 'stu'
        self.cmd_performers = 'performers'
        self.cmd_movies = 'movies'
        # default to every command; an explicit cmd argument overrides the list
        self.cmd_list = [self.cmd_astro, self.cmd_birth, self.cmd_ethnic, self.cmd_dist, self.cmd_stu, self.cmd_performers, self.cmd_movies]
        if cmd and cmd != '':
            self.cmd_list = cmd.split(',')

    # entry point, triggered by the base class
    def custom_start_requests(self):
        # dispatch on the configured command words
        if self.cmd_astro in self.cmd_list:
            # key point: iterate the generator returned by start_astro and forward its Requests
            for req in self.start_astro():
                yield req  # hand each Request over to the framework

        # actor lists by birthday
        if self.cmd_birth in self.cmd_list:
            for req in self.start_birth():
                yield req

        # ethnicity list
        if self.cmd_ethnic in self.cmd_list:
            yield scrapy.Request(self.ethnic_list_url, callback=self.parse_ethnic_list_page)

        # distributors list
        if self.cmd_dist in self.cmd_list:
            yield scrapy.Request(self.distributors_list_url, callback=self.parse_distributors_list_page)

@ -68,6 +70,8 @@ class IAFDSpider(BaseSpider):
        # load the actors pending update
        if self.cmd_performers in self.cmd_list:
            actors = db_tools.get_performers(**query_args)
            self.crawler.stats.set_value(f"{self.name}/actor_all", len(actors) if actors else 0)
            self.crawler.stats.set_value(f"{self.name}/actor_done", 0)
            if actors:
                for item in actors:
                    href = item.get('href', '')

@ -78,6 +82,8 @@ class IAFDSpider(BaseSpider):
        # load the movies pending update
        if self.cmd_movies in self.cmd_list:
            movies = db_tools.get_movies(**query_args)
            self.crawler.stats.set_value(f"{self.name}/movies_all", len(movies) if movies else 0)
            self.crawler.stats.set_value(f"{self.name}/movies_done", 0)
            if movies:
                for item in movies:
                    href = item.get('href', '')

@ -155,6 +161,8 @@ class IAFDSpider(BaseSpider):
        div_root = response.css('select#ethnicity1')
        if div_root:
            options = div_root.css('option')
            self.crawler.stats.set_value(f"{self.name}/ethnic_all", len(options))
            self.crawler.stats.set_value(f"{self.name}/ethnic_done", 0)
            for option in options:
                href = option.attrib.get('value')
                text = option.css('::text').get().strip()

@ -190,6 +198,9 @@ class IAFDSpider(BaseSpider):
        if next_page:
            next_url = self.host_url + next_page.attrib['href']
            yield scrapy.Request(next_url, callback=self.parse_ethnic_page, meta={'ethnic': ethnic})
        else:
            self.crawler.stats.inc_value(f"{self.name}/ethnic_done")
            self.logger.info(f"ethnic ({ethnic}) all fetched. curr url: {response.url}")

    def parse_distributors_list_page(self, response):
        select_element = response.css('select[name="Distrib"]')

scrapy_proj/scrapy_proj/spiders/javhd_spider.py (new file, 191 lines)
@ -0,0 +1,191 @@
import scrapy
import sys
import re
from urllib.parse import urljoin, quote_plus
from scrapy_proj.utils.utils import parse_size, parse_date_to_datetime, load_json_file, replace_lang_param
from scrapy_proj.spiders.base_spider import BaseSpider, extract_text_from_element
from scrapy_proj.items import JavHDActorIndexItem, JavHDActorItem
from scrapy_proj.comm.comm_def import SPIDER_NAME_JAVHD, ITEM_TYPE_ACTOR_INDEX, ITEM_TYPE_ACTOR_DETAIL
from scrapy_proj.db_wapper.spider_db_handler import JavHDDBHandler

db_tools = JavHDDBHandler()

class JavhdSpider(BaseSpider):
    name = SPIDER_NAME_JAVHD
    allowed_domains = ["www.javhd.com", "javhd.com"]

    # separate header sets for POST and GET requests
    custom_settings = {
        # POST headers (list pages only)
        "POST_HEADERS": {
            "accept": "application/json, text/plain, */*",
            "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
            "content-type": "application/json",
            "origin": "https://javhd.com",
            "referer": "https://javhd.com/zh/model",
            "sec-ch-ua": "\"Not)A;Brand\";v=\"8\", \"Chromium\";v=\"138\", \"Microsoft Edge\";v=\"138\"",
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": "\"macOS\"",
            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36 Edg/138.0.0.0",
            "x-requested-with": "XMLHttpRequest"
        },
        # GET headers (detail pages only)
        "GET_HEADERS": {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
            "accept-language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
            "priority": "u=0, i",
            "referer": "https://javhd.com/zh/model/popular",
            "sec-ch-ua": "\"Not)A;Brand\";v=\"8\", \"Chromium\";v=\"138\", \"Microsoft Edge\";v=\"138\"",
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": "\"macOS\"",
            "sec-fetch-dest": "document",
            "sec-fetch-mode": "navigate",
            "sec-fetch-site": "same-origin",
            "sec-fetch-user": "?1",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36 Edg/138.0.0.0"
        },
        "COOKIES_ENABLED": True
    }

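    # Note: POST_HEADERS / GET_HEADERS are not built-in Scrapy settings; they are
    # custom keys stashed in custom_settings and read back via self.settings.get(...).
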
    def __init__(self, debug='false', mod='update', *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.debug = str(debug).lower() in ('true', '1')
        # update mode skips actors that already have full data; mod='force' refetches everything
        self.update_mod = not (mod and mod.lower() == 'force')

        self.logger.info(f"RUN CMD: {' '.join(sys.argv)}")

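    # Note: Scrapy passes `-a key=value` CLI arguments to __init__ as string keyword
    # arguments, e.g. `scrapy crawl javhd -a mod='force'` arrives here as mod="force".
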
    # entry point, triggered by the base class
    def custom_start_requests(self):
        lang_list = ['en', 'zh', 'ja']
        for lang in lang_list:
            url = f"https://javhd.com/{lang}/model"
            # the list page expects a POST with an empty JSON body
            # (matches the original curl's --data-raw '{}')
            yield scrapy.Request(
                url=url,
                method="POST",
                body="{}",  # empty JSON request body
                headers=self.settings.get("POST_HEADERS"),  # POST header set
                callback=self.parse_list,
                meta={'lang': lang, 'current_page': 1}  # page number feeds the rank calculation
            )

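    # Note: scrapy.http.JsonRequest(url, data={}) would be an equivalent way to send
    # the empty-JSON POST (Scrapy >= 1.8); the explicit Request keeps full header control.
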
    def parse_list(self, response):
        """Parse the list-page JSON, emit index items, and follow detail pages."""
        try:
            # parse the JSON response (same payload the curl request returns)
            data = response.json()
        except Exception as e:
            self.logger.error(f"failed to parse list-page JSON: {e}, response body: {response.text[:500]}")
            return

        # read current language and page number from meta
        lang = response.meta.get("lang", 'en')
        current_page = response.meta.get("current_page", 1)
        self.logger.info(f"parsing page {current_page}, url: {response.url}")

        template = data.get("template", "")
        thumb_components = re.findall(r'<thumb-component[^>]*>', template)
        for idx, thumb in enumerate(thumb_components, start=1):
            # rank = (page - 1) * items_per_page + index; the site lists 36 models per page
            rank = (current_page - 1) * 36 + idx

            # extract fields from the embedded component markup
            link_content = re.search(r'link-content="(.*?)"', thumb)
            url_thumb = re.search(r'url-thumb="(.*?)"', thumb)
            title = re.search(r'title="(.*?)"', thumb)

            # skip incomplete entries
            if not url_thumb or not title:
                self.logger.warning(f"rank {rank} entry incomplete, skipping | raw: {thumb}")
                continue

            # field values
            pic = url_thumb.group(1)
            name = title.group(1)
            url = link_content.group(1) if link_content else ""

            item = JavHDActorIndexItem()
            item['item_type'] = ITEM_TYPE_ACTOR_INDEX
            item['rank'] = rank
            item['url'] = url
            item[f'{lang}_name'] = name
            # TODO: for non-English pages, update the name on the matching English row
            if lang != 'en':
                item['url'] = replace_lang_param(item['url'])
            yield item

            # only English pages trigger detail requests
            if url and lang == 'en':
                actor_exists = 0 if not self.update_mod else db_tools.has_full_data(url)
                if actor_exists < 1:
                    yield scrapy.Request(
                        url=url,
                        headers=self.settings.get("GET_HEADERS"),  # GET header set
                        callback=self.parse_detail,
                        meta={"list_item": item}  # pass list-page data to the detail parser
                    )
                else:
                    self.logger.info(f"actor({name}) has full data, skipping. url: {url}")

        # follow the next page
        next_path = data.get("pagination_params", {}).get("next")
        if next_path:
            current_url = urljoin(response.url, next_path)
            yield scrapy.Request(
                url=current_url,
                method="POST",
                body="{}",  # empty JSON request body
                headers=self.settings.get("POST_HEADERS"),
                callback=self.parse_list,
                meta={'lang': lang, 'current_page': current_page + 1}
            )
        else:
            self.logger.info(f"list crawl finished, url: {response.url}")

    def parse_detail(self, response):
        list_item = response.meta.get("list_item", {})
        info_section = response.css("div.info__features")
        if not info_section:
            self.logger.warning(f"info__features section not found: {response.url}")
            return

        # map on-page labels to database field names
        FIELD_MAPPING = {
            "Height": "height",
            "Weight": "weight",
            "Breast size": "breast_size",
            "Breast factor": "breast_factor",
            "Hair color": "hair_color",
            "Eye color": "eye_color",
            "Birth date": "birth_date",
            "Ethnicity": "ethnicity",
            "Birth place": "birth_place"
        }

        item = JavHDActorItem()
        item['item_type'] = ITEM_TYPE_ACTOR_DETAIL
        item['url'] = response.url
        item['is_full_data'] = 1
        item['rank'] = list_item['rank']

        # actor name from the h1.title text
        item['en_name'] = response.css("div.header__info h1.title::text").get(default="").strip()
        # avatar URL from the img src attribute
        item['pic'] = response.css("div.header__info div.avatar img::attr(src)").get(default="").strip()

        # walk the feature list items with Scrapy CSS selectors
        for li in info_section.css("li.content-desc__list-item"):
            title = extract_text_from_element(li.css("strong.content-desc__list-title"))
            value = extract_text_from_element(li.css("span.content-desc__list-text"))

            if title and value:
                # translate the on-page label into its DB field name
                db_field = FIELD_MAPPING.get(title)
                if db_field:
                    item[db_field] = value

        self.logger.info(f"fetched actor({item['en_name']}) data. url: {response.url}")
        yield item
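    # Illustrative example (values made up): an <li> whose label is "Height" and whose
    # text is "160 cm" ends up as item['height'] = "160 cm" via FIELD_MAPPING.
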
@ -2,6 +2,7 @@ import re
import json
import os
from datetime import datetime, timezone
from urllib.parse import urlparse, urlunparse, parse_qs, urlencode

def load_json_file(file_path):
    # check that the file exists

@ -101,4 +102,30 @@ def parse_date_to_datetime(date_str):
            return datetime.strptime(date_str, format_str)

    # no format matched
    return None


def replace_lang_param(url: str) -> str:
    """
    Normalize the language segment of a URL to 'en', including a lang code embedded in the path.
    """
    parsed = urlparse(url)

    # handle a lang segment in the path (e.g. /ja/model/... or /en/model/...)
    path_parts = parsed.path.split('/')
    if len(path_parts) >= 2 and path_parts[1] in ['en', 'ja', 'zh']:
        path_parts[1] = 'en'  # replace the second path segment with 'en'
        new_path = '/'.join(path_parts)
    else:
        new_path = parsed.path

    # round-trip the query string unchanged
    query = parse_qs(parsed.query)

    # rebuild the URL
    new_parsed = parsed._replace(
        path=new_path,
        query=urlencode(query, doseq=True)
    )
    return urlunparse(new_parsed)
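
# Quick sanity check (made-up URL):
#   replace_lang_param('https://javhd.com/ja/model/alice')
#   # -> 'https://javhd.com/en/model/alice'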