modify scripts
This commit is contained in:
@ -84,13 +84,17 @@ class ClmDBHandler(SQLiteDBHandler):
|
|||||||
if item['item_type'] == comm.ITEM_TYPE_CLM_INDEX:
|
if item['item_type'] == comm.ITEM_TYPE_CLM_INDEX:
|
||||||
self.insert_index(item)
|
self.insert_index(item)
|
||||||
elif item['item_type'] == comm.ITEM_TYPE_CLM_KEYWORDS:
|
elif item['item_type'] == comm.ITEM_TYPE_CLM_KEYWORDS:
|
||||||
self.insert_or_update_common(item, self.tbl_name_clm_keywords, uniq_key=None, exists_do_nothing=False)
|
self.insert_or_update_common(item, self.tbl_name_clm_keywords, uniq_key='words', exists_do_nothing=False)
|
||||||
else:
|
else:
|
||||||
logging.error(f"unkown item.")
|
logging.error(f"unkown item.")
|
||||||
|
|
||||||
return item
|
return item
|
||||||
|
|
||||||
def insert_index(self, item):
|
def insert_index(self, item):
|
||||||
|
if item['is_update']: # 仅更新
|
||||||
|
self.insert_or_update_common(item, self.tbl_name_clm_index, uniq_key='href', exists_do_nothing=False)
|
||||||
|
return
|
||||||
|
|
||||||
row_id = self.insert_or_update_common(item, self.tbl_name_clm_index, uniq_key='href', exists_do_nothing=True)
|
row_id = self.insert_or_update_common(item, self.tbl_name_clm_index, uniq_key='href', exists_do_nothing=True)
|
||||||
if row_id:
|
if row_id:
|
||||||
lnk_data = {}
|
lnk_data = {}
|
||||||
@ -106,6 +110,19 @@ class ClmDBHandler(SQLiteDBHandler):
|
|||||||
else:
|
else:
|
||||||
logging.warning(f"insert index error: {item}")
|
logging.warning(f"insert index error: {item}")
|
||||||
|
|
||||||
|
def get_empty_title(self):
    """Return the index rows whose title is missing so they can be re-fetched.

    A row counts as missing when its title is either the empty string or
    NULL; a plain ``title=''`` comparison never matches NULL in SQL.

    Returns:
        A list of ``{'id': ..., 'href': ...}`` dicts (possibly empty), or
        ``None`` when the query raises ``sqlite3.Error``.
    """
    try:
        self.cursor.execute(
            f"SELECT id, href FROM {self.tbl_name_clm_index} "
            f"WHERE title IS NULL OR title='' "
        )
        # dict(row) assumes the cursor yields mapping-style rows
        # (e.g. sqlite3.Row) -- TODO confirm against the base handler.
        return [dict(row) for row in self.cursor.fetchall()]
    except sqlite3.Error as e:
        logging.error(f"查询 href 失败: {e}")
        return None
|
||||||
|
|
||||||
|
def get_count_by_keywords_id(self, key_words_id):
    """Count the rows in the words/index link table for one keyword id.

    Args:
        key_words_id: value matched against the ``words_id`` column.

    Returns:
        The row count as an int, or ``None`` when no row comes back or the
        query raises ``sqlite3.Error`` (the same error convention as
        ``get_empty_title``).
    """
    try:
        self.cursor.execute(
            f"SELECT count(*) as cnt from {self.tbl_name_words_index} WHERE words_id = ?",
            (key_words_id,),
        )
        row = self.cursor.fetchone()
    except sqlite3.Error as e:
        # The visible caller only logs this number, so report and degrade
        # instead of aborting the spider callback.
        logging.error(f"count by words_id failed: {e}")
        return None
    return row[0] if row else None
|
||||||
|
|
||||||
# 按条件查询 href 列表
|
# 按条件查询 href 列表
|
||||||
def get_key_words(self, **filters):
|
def get_key_words(self, **filters):
|
||||||
try:
|
try:
|
||||||
@ -128,6 +145,9 @@ class ClmDBHandler(SQLiteDBHandler):
|
|||||||
else:
|
else:
|
||||||
params.append(filters[key])
|
params.append(filters[key])
|
||||||
|
|
||||||
|
if "query_str" in filters:
|
||||||
|
sql += f" AND {filters['query_str']} "
|
||||||
|
|
||||||
if "order_by" in filters:
|
if "order_by" in filters:
|
||||||
# 注意:这里 order by 后面直接跟字段名,不能用占位符,否则会被当作字符串处理
|
# 注意:这里 order by 后面直接跟字段名,不能用占位符,否则会被当作字符串处理
|
||||||
sql += f" ORDER BY {filters['order_by']} "
|
sql += f" ORDER BY {filters['order_by']} "
|
||||||
|
|||||||
@ -155,6 +155,7 @@ class ClmIndexItem(scrapy.Item):
|
|||||||
last_down_date = scrapy.Field()
|
last_down_date = scrapy.Field()
|
||||||
key_words_id = scrapy.Field()
|
key_words_id = scrapy.Field()
|
||||||
key_words = scrapy.Field()
|
key_words = scrapy.Field()
|
||||||
|
is_update = scrapy.Field()
|
||||||
|
|
||||||
class ClmKeywordsIndexItem(scrapy.Item):
|
class ClmKeywordsIndexItem(scrapy.Item):
|
||||||
item_type = scrapy.Field()
|
item_type = scrapy.Field()
|
||||||
|
|||||||
@ -8,35 +8,6 @@ from scrapy_proj.comm.comm_def import SPIDER_NAME_CLM, ITEM_TYPE_CLM_INDEX, ITEM
|
|||||||
from scrapy_proj.db_wapper.spider_db_handler import IAFDDBHandler, ClmDBHandler
|
from scrapy_proj.db_wapper.spider_db_handler import IAFDDBHandler, ClmDBHandler
|
||||||
|
|
||||||
db_clm = ClmDBHandler()
|
db_clm = ClmDBHandler()
|
||||||
db_comm = IAFDDBHandler()
|
|
||||||
|
|
||||||
default_keywords = [
|
|
||||||
{'vixen group' : ['vixen', 'tushy', 'tushyraw', 'blacked', 'blackedraw', 'deeper']},
|
|
||||||
{'VIP 4K' : ['Cuck4K', 'Daddy4k', 'Loan4k', 'Dyke4K', 'Rim4k', 'Pie4k', 'Ignore4K', 'Daddy4k', 'Stuck4k', 'Tutor4k']},
|
|
||||||
{'Teen Mega World' : ['anal-angels', 'Anal-Beauty', 'Beauty4k', 'creampie-angels', 'Beauty-Angels', 'FirstBGG', 'FuckStudies', 'OhMyHoles', 'X-Angels']},
|
|
||||||
{'Fuck You Cash' : ['BBCPie', 'Tiny4k', 'Cum4K', 'Anal4K', 'Exotic4K', 'Facials4k', 'Holed', 'Lubed', 'Mom4K', 'passion hd']},
|
|
||||||
{'Naughty America (Network)' : ['Naughty Office', 'Naughty Americans', 'Naughty America', 'Naughty Weddings']},
|
|
||||||
{'Nubiles Porn (Network)' : ['MyFamilyPies', 'StepSiblingsCaught', 'nubilesporn']},
|
|
||||||
{'Brazzers' : ['Real Wife Stories', 'brazzers']},
|
|
||||||
{'TeamSkeet (Network)' : ['teenpies', 'shoplyfter']},
|
|
||||||
{'BangBros' : ['BangBus', 'BangBros']},
|
|
||||||
{'Nubile Films' : ['nfbusty', 'NubileFilms']},
|
|
||||||
{'DDF Network' : ['DDFBusty']},
|
|
||||||
{'Adult Time (Network)' : ['AdultTime', 'BurningAngel']},
|
|
||||||
{'Anal Vids' : ['AnalVids']},
|
|
||||||
{'LegalPorno' : ['LegalPorno']},
|
|
||||||
{'Pornbox' : ['Pornworld']},
|
|
||||||
{'Wow (Network)' : ['WowGirls']},
|
|
||||||
{'Malibu Media' : ['x-art']},
|
|
||||||
{'VIPissy Cash' : ['VIPissy']},
|
|
||||||
{'japan Blu-Ray' : ['Japan AV Blu-Ray']},
|
|
||||||
{'siterip' : ['siterip']},
|
|
||||||
{'Brazil' : ['NewMFX']},
|
|
||||||
{'Wicked' : ['Wicked']},
|
|
||||||
{'Sticky Dollars' : ['Swallowed']},
|
|
||||||
{'ManyVids' : ['ManyVids']},
|
|
||||||
{'PervCity' : ['AnalOverdose']}
|
|
||||||
]
|
|
||||||
|
|
||||||
class ClmSpider(BaseSpider):
|
class ClmSpider(BaseSpider):
|
||||||
name = SPIDER_NAME_CLM
|
name = SPIDER_NAME_CLM
|
||||||
@ -51,6 +22,7 @@ class ClmSpider(BaseSpider):
|
|||||||
self.keywords = keywords
|
self.keywords = keywords
|
||||||
self.min_size = float(min_size) if min_size else 1.0
|
self.min_size = float(min_size) if min_size else 1.0
|
||||||
self.run_task = True
|
self.run_task = True
|
||||||
|
self.fix_title = False
|
||||||
|
|
||||||
# 增加一个暗号
|
# 增加一个暗号
|
||||||
if keywords and keywords.lower() == 'reload' :
|
if keywords and keywords.lower() == 'reload' :
|
||||||
@ -59,86 +31,38 @@ class ClmSpider(BaseSpider):
|
|||||||
self.run_task = False
|
self.run_task = False
|
||||||
self.logger.info(f"reload keywords db succ!")
|
self.logger.info(f"reload keywords db succ!")
|
||||||
|
|
||||||
# 指定的关键词,导入到数据库
|
# 增加一个暗号
|
||||||
def initDB(self):
|
if keywords and keywords.lower() == 'fix' :
|
||||||
for row in default_keywords:
|
self.fix_title = True
|
||||||
for group, items in row.items():
|
self.run_task = False
|
||||||
for item in items:
|
|
||||||
words_item = ClmKeyWordsItem()
|
|
||||||
words_item['item_type'] = ITEM_TYPE_CLM_KEYWORDS
|
|
||||||
words_item['words'] = item
|
|
||||||
words_item['groups'] = group
|
|
||||||
words_item['tags'] = ''
|
|
||||||
words_item['index_count'] = 0
|
|
||||||
db_clm.insert_item(words_item)
|
|
||||||
self.logger.debug(f"insert item: {item}: {group}")
|
|
||||||
|
|
||||||
# 从其他数据源获取到演员列表,导入到数据库
|
|
||||||
def init_load_actors_from_others(self):
|
|
||||||
all_likes = {
|
|
||||||
'vixen' : ['vixen.com', 'Vixen Video'],
|
|
||||||
'tushy' : ['tushy.com', 'tushyraw.com', 'Tushy', 'Tushy Raw'],
|
|
||||||
'blacked' : ['blacked.com', 'Blacked', 'blackedraw.com', 'Blacked Raw'],
|
|
||||||
'x-art' : ['x-art.com', 'X-art'],
|
|
||||||
'nfbusty' : ['nfbusty.com']
|
|
||||||
}
|
|
||||||
# 先转换个格式
|
|
||||||
all_key_group = {}
|
|
||||||
all_keys = []
|
|
||||||
for group, keys in all_likes.items():
|
|
||||||
for key in keys:
|
|
||||||
all_key_group[key] = group
|
|
||||||
all_keys.append(key)
|
|
||||||
|
|
||||||
# 查询数据库,并转换数据
|
|
||||||
actor_tags = {}
|
|
||||||
total_lines = 0
|
|
||||||
results = db_comm.get_iafd_actors(names=all_keys, tbl='stu')
|
|
||||||
for dist, actors in results.items():
|
|
||||||
self.logger.info(f"dist: {dist}, actors count: {len(actors)}")
|
|
||||||
total_lines += len(actors)
|
|
||||||
for actor in actors :
|
|
||||||
#self.logger.debug(f"get {dist} : {actor['name']}, {actor['href']}")
|
|
||||||
actor_name = actor['name']
|
|
||||||
current_tag = all_key_group.get(dist, '')
|
|
||||||
if actor_name not in actor_tags:
|
|
||||||
actor_tags[actor_name] = set() # 用set自动去重
|
|
||||||
if current_tag:
|
|
||||||
actor_tags[actor_name].add(current_tag) # set的add方法,重复值会自动忽略
|
|
||||||
self.logger.info(f"total actors in iafd: {len(actor_tags)}, total lines: {total_lines}")
|
|
||||||
|
|
||||||
# 查询另一个数据表,获取结果
|
|
||||||
load_results = db_comm.get_lord_actors()
|
|
||||||
if load_results:
|
|
||||||
self.logger.info(f"total actors in lord: {len(load_results)}")
|
|
||||||
for row in load_results:
|
|
||||||
actor_name = row['name']
|
|
||||||
if actor_name not in actor_tags:
|
|
||||||
actor_tags[actor_name] = set() # 用set自动去重
|
|
||||||
actor_tags[actor_name].add('thelordofporn') # set的add方法,重复值会自动忽略
|
|
||||||
|
|
||||||
self.logger.info(f"after merge, total actors: {len(actor_tags)}")
|
|
||||||
for actor, tags_set in actor_tags.items():
|
|
||||||
tag_str = ','.join(tags_set) # set直接支持迭代,无需额外转换
|
|
||||||
self.logger.info(f"actor: {actor}, tags: {tag_str}")
|
|
||||||
words_item = ClmKeyWordsItem()
|
|
||||||
words_item['item_type'] = ITEM_TYPE_CLM_KEYWORDS
|
|
||||||
words_item['words'] = actor
|
|
||||||
words_item['groups'] = 'actress'
|
|
||||||
words_item['tags'] = tag_str
|
|
||||||
words_item['index_count'] = 0
|
|
||||||
db_clm.insert_item(words_item)
|
|
||||||
#self.logger.debug(f"insert item: {words_item}")
|
|
||||||
|
|
||||||
# 入口函数,由基类的方法触发
|
# 入口函数,由基类的方法触发
|
||||||
def custom_start_requests(self):
|
def custom_start_requests(self):
|
||||||
|
if self.fix_title:
|
||||||
|
data = db_clm.get_empty_title()
|
||||||
|
if data:
|
||||||
|
self.logger.info(f"rows to be fixed: {len(data)}")
|
||||||
|
for row in data:
|
||||||
|
url = row['href']
|
||||||
|
# 递归请求下一页
|
||||||
|
yield scrapy.Request(
|
||||||
|
url=url,
|
||||||
|
callback=self.parse_page_detail,
|
||||||
|
meta={'url': url},
|
||||||
|
dont_filter=True # 允许重复请求(防止因URL参数被过滤)
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
self.logger.warning(f"no data.")
|
||||||
|
|
||||||
if not self.run_task:
|
if not self.run_task:
|
||||||
return
|
return
|
||||||
|
|
||||||
|
tmp_query_str = f" groups='actress' and tags not like '%vixen%' "
|
||||||
if self.debug:
|
if self.debug:
|
||||||
keywords = db_clm.get_key_words(limit =5)
|
keywords = db_clm.get_key_words(limit =5, query_str = tmp_query_str)
|
||||||
else:
|
else:
|
||||||
keywords = db_clm.get_key_words(groups='actress', tags='vixen')
|
#keywords = db_clm.get_key_words(groups='actress', tags='vixen')
|
||||||
|
keywords = db_clm.get_key_words(query_str = tmp_query_str)
|
||||||
|
|
||||||
for item in keywords:
|
for item in keywords:
|
||||||
words_id = item['id']
|
words_id = item['id']
|
||||||
@ -197,7 +121,8 @@ class ClmSpider(BaseSpider):
|
|||||||
# h3 下的 a 标签(标题链接)
|
# h3 下的 a 标签(标题链接)
|
||||||
h3_a = ssbox.xpath('.//div[@class="title"]/h3/a')
|
h3_a = ssbox.xpath('.//div[@class="title"]/h3/a')
|
||||||
# 标题文本(如 "Vixen.2025.05")
|
# 标题文本(如 "Vixen.2025.05")
|
||||||
title_text = h3_a.xpath('text()').get().strip() if h3_a else None
|
#title_text = h3_a.xpath('text()').get().strip() if h3_a else None
|
||||||
|
title_text = extract_text_from_element(h3_a, use_title=True)
|
||||||
# 标题链接(如 "/hash/34c71bf8ddff9c797dab7ee1af83894fee13ac67.html")
|
# 标题链接(如 "/hash/34c71bf8ddff9c797dab7ee1af83894fee13ac67.html")
|
||||||
title_href = h3_a.xpath('@href').get() if h3_a else None
|
title_href = h3_a.xpath('@href').get() if h3_a else None
|
||||||
# 若链接是相对路径,可拼接成完整URL(根据网站域名调整)
|
# 若链接是相对路径,可拼接成完整URL(根据网站域名调整)
|
||||||
@ -240,6 +165,7 @@ class ClmSpider(BaseSpider):
|
|||||||
item['last_down_date'] = last_download
|
item['last_down_date'] = last_download
|
||||||
item['key_words_id'] = response.meta.get('words_id', 0)
|
item['key_words_id'] = response.meta.get('words_id', 0)
|
||||||
item['key_words'] = response.meta.get('words', '')
|
item['key_words'] = response.meta.get('words', '')
|
||||||
|
item['is_update'] = False
|
||||||
|
|
||||||
yield item
|
yield item
|
||||||
|
|
||||||
@ -249,7 +175,7 @@ class ClmSpider(BaseSpider):
|
|||||||
# 解析下一页链接
|
# 解析下一页链接
|
||||||
pager = response.xpath('//div[@class="pager"]')
|
pager = response.xpath('//div[@class="pager"]')
|
||||||
if pager:
|
if pager:
|
||||||
total_text = pager.xpath('.//span[contains(text(), "共")]/text()').get() if sbar else ''
|
total_text = pager.xpath('.//span[contains(text(), "共")]/text()').get()
|
||||||
|
|
||||||
# 定位“下一页”的a标签(通过文本定位,避免混淆其他a标签)
|
# 定位“下一页”的a标签(通过文本定位,避免混淆其他a标签)
|
||||||
next_page_a = pager.xpath('.//a[contains(text(), "下一页")]').get()
|
next_page_a = pager.xpath('.//a[contains(text(), "下一页")]').get()
|
||||||
@ -270,5 +196,118 @@ class ClmSpider(BaseSpider):
|
|||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
# 当href为#或不存在时,说明已无下一页
|
# 当href为#或不存在时,说明已无下一页
|
||||||
self.logger.info(f'已获取完所有页面,停止翻页. {total_text}')
|
total_rows = db_clm.get_count_by_keywords_id(response.meta.get('words_id', 0))
|
||||||
|
curr_words = response.meta.get('words', '')
|
||||||
|
self.logger.info(f'已获取完所有页面,停止翻页. {total_text}, 共 {total_rows} 条记录。 key words: ({curr_words}), url: {response.url}')
|
||||||
|
|
||||||
|
def parse_page_detail(self, response):
    """Parse a detail page and yield an update-only item carrying its title.

    The yielded ``ClmIndexItem`` has ``is_update`` set to True and ``href``
    taken from ``response.meta['url']``.

    If the page has no usable title (the XPath matches nothing, or only
    whitespace), nothing is yielded, so an existing title is never
    overwritten with an empty value.
    """
    # Text of the <h2> inside the div whose class is 'bt_title'.
    raw_title = response.xpath('//div[@class="bt_title"]/h2/text()').get()
    title = raw_title.strip() if raw_title else ''
    if not title:
        self.logger.warning(f"no title found on detail page, skip update. url: {response.url}")
        return

    item = ClmIndexItem()
    item['item_type'] = ITEM_TYPE_CLM_INDEX
    item['title'] = title
    item['href'] = response.meta['url']
    item['is_update'] = True

    yield item
|
||||||
|
|
||||||
|
|
||||||
|
# 指定的关键词,导入到数据库
|
||||||
|
def initDB(self):
    """Load the built-in keyword table into the database.

    For every (group, word) pair in the table below, a ``ClmKeyWordsItem``
    is built with empty tags and an index count of 0, then handed to
    ``db_clm.insert_item``.
    """
    # group name -> search words belonging to that group
    keyword_groups = {
        'vixen group': ['vixen', 'tushy', 'tushyraw', 'blacked', 'blackedraw', 'deeper'],
        'VIP 4K': ['Cuck4K', 'Daddy4k', 'Loan4k', 'Dyke4K', 'Rim4k', 'Pie4k', 'Ignore4K', 'Daddy4k', 'Stuck4k', 'Tutor4k'],
        'Teen Mega World': ['anal-angels', 'Anal-Beauty', 'Beauty4k', 'creampie-angels', 'Beauty-Angels', 'FirstBGG', 'FuckStudies', 'OhMyHoles', 'X-Angels'],
        'Fuck You Cash': ['BBCPie', 'Tiny4k', 'Cum4K', 'Anal4K', 'Exotic4K', 'Facials4k', 'Holed', 'Lubed', 'Mom4K', 'passion hd'],
        'Naughty America (Network)': ['Naughty Office', 'Naughty Americans', 'Naughty America', 'Naughty Weddings'],
        'Nubiles Porn (Network)': ['MyFamilyPies', 'StepSiblingsCaught', 'nubilesporn'],
        'Brazzers': ['Real Wife Stories', 'brazzers'],
        'TeamSkeet (Network)': ['teenpies', 'shoplyfter'],
        'BangBros': ['BangBus', 'BangBros'],
        'Nubile Films': ['nfbusty', 'NubileFilms'],
        'DDF Network': ['DDFBusty'],
        'Adult Time (Network)': ['AdultTime', 'BurningAngel'],
        'Anal Vids': ['AnalVids'],
        'LegalPorno': ['LegalPorno'],
        'Pornbox': ['Pornworld'],
        'Wow (Network)': ['WowGirls'],
        'Malibu Media': ['x-art'],
        'VIPissy Cash': ['VIPissy'],
        'japan Blu-Ray': ['Japan AV Blu-Ray'],
        'siterip': ['siterip'],
        'Brazil': ['NewMFX'],
        'Wicked': ['Wicked'],
        'Sticky Dollars': ['Swallowed'],
        'ManyVids': ['ManyVids'],
        'PervCity': ['AnalOverdose'],
    }

    for group_name, words in keyword_groups.items():
        for word in words:
            record = ClmKeyWordsItem()
            record['item_type'] = ITEM_TYPE_CLM_KEYWORDS
            record['words'] = word
            record['groups'] = group_name
            record['tags'] = ''
            record['index_count'] = 0
            db_clm.insert_item(record)
            self.logger.debug(f"insert item: {word}: {group_name}")
|
||||||
|
|
||||||
|
# 从其他数据源获取到演员列表,导入到数据库
|
||||||
|
def init_load_actors_from_others(self):
    """Import actor names from other data sources as 'actress' keywords.

    Steps:
      1. Query ``IAFDDBHandler.get_iafd_actors`` with the distributor names
         listed below and tag every returned actor with the matching group.
      2. Add every actor returned by ``get_lord_actors`` with the tag
         'thelordofporn'.
      3. Hand one ``ClmKeyWordsItem`` per actor to ``db_clm.insert_item``,
         with the tags joined by commas in sorted order.

    Tags are sorted because a set of strings has no stable iteration order
    across interpreter runs; without sorting, the stored ``tags`` value
    would change from run to run for the same data.
    """
    db_comm = IAFDDBHandler()

    # tag -> distributor names used to look actors up
    all_likes = {
        'vixen': ['vixen.com', 'Vixen Video'],
        'tushy': ['tushy.com', 'tushyraw.com', 'Tushy', 'Tushy Raw'],
        'blacked': ['blacked.com', 'Blacked', 'blackedraw.com', 'Blacked Raw'],
        'x-art': ['x-art.com', 'X-art'],
        'nfbusty': ['nfbusty.com'],
    }

    # Invert the table: distributor name -> tag, plus the flat list of names.
    all_key_group = {}
    all_keys = []
    for group, keys in all_likes.items():
        for key in keys:
            all_key_group[key] = group
            all_keys.append(key)

    # actor name -> set of tags (the set removes duplicates)
    actor_tags = {}
    total_lines = 0
    results = db_comm.get_iafd_actors(names=all_keys, tbl='stu')
    if not results:
        # NOTE(review): presumably the handler can return None/empty on a
        # DB error, as other handlers in this project do -- confirm.
        self.logger.warning("no actors returned from iafd.")
        results = {}
    for dist, actors in results.items():
        self.logger.info(f"dist: {dist}, actors count: {len(actors)}")
        total_lines += len(actors)
        current_tag = all_key_group.get(dist, '')
        for actor in actors:
            tags = actor_tags.setdefault(actor['name'], set())
            if current_tag:
                tags.add(current_tag)
    self.logger.info(f"total actors in iafd: {len(actor_tags)}, total lines: {total_lines}")

    # Merge in the actors from the second table.
    load_results = db_comm.get_lord_actors()
    if load_results:
        self.logger.info(f"total actors in lord: {len(load_results)}")
        for row in load_results:
            actor_tags.setdefault(row['name'], set()).add('thelordofporn')

    self.logger.info(f"after merge, total actors: {len(actor_tags)}")
    for actor, tags_set in actor_tags.items():
        tag_str = ','.join(sorted(tags_set))
        self.logger.info(f"actor: {actor}, tags: {tag_str}")
        words_item = ClmKeyWordsItem()
        words_item['item_type'] = ITEM_TYPE_CLM_KEYWORDS
        words_item['words'] = actor
        words_item['groups'] = 'actress'
        words_item['tags'] = tag_str
        words_item['index_count'] = 0
        db_clm.insert_item(words_item)
|
||||||
|
|||||||
Reference in New Issue
Block a user