modify scripts
This commit is contained in:
@ -184,17 +184,21 @@ def generate_union_table(mysql_conn):
|
|||||||
for idx, row in enumerate(stash_data):
|
for idx, row in enumerate(stash_data):
|
||||||
# 规范化name字段用于匹配
|
# 规范化name字段用于匹配
|
||||||
key_str = parse_union_key(row["date"], row["name"], row["code"])
|
key_str = parse_union_key(row["date"], row["name"], row["code"])
|
||||||
dict_stash_data[key_str] = idx
|
if key_str not in dict_stash_data:
|
||||||
|
dict_stash_data[key_str] = set()
|
||||||
|
dict_stash_data[key_str].add(idx)
|
||||||
|
stash_data[idx]['matched'] = False # 标记是否已匹配
|
||||||
|
|
||||||
dict_whisper_data = {}
|
dict_whisper_data = {}
|
||||||
for idx, row in enumerate(whisper_data):
|
for idx, row in enumerate(whisper_data):
|
||||||
# 规范化name字段用于匹配
|
# 规范化name字段用于匹配
|
||||||
key_str = parse_union_key(row["release_date"], row["studio_name"], row["whisper_code"])
|
key_str = parse_union_key(row["release_date"], row["studio_name"], row["whisper_code"])
|
||||||
dict_whisper_data[key_str] = idx
|
if key_str not in dict_whisper_data:
|
||||||
|
dict_whisper_data[key_str] = set()
|
||||||
|
dict_whisper_data[key_str].add(idx)
|
||||||
|
|
||||||
# 4. 内存中进行双边匹配
|
# 4. 内存中进行双边匹配
|
||||||
result = []
|
result = []
|
||||||
dict_result_keys = set()
|
|
||||||
count_matched = 0
|
count_matched = 0
|
||||||
count_whisper_only = 0
|
count_whisper_only = 0
|
||||||
count_stash_only = 0
|
count_stash_only = 0
|
||||||
@ -202,27 +206,48 @@ def generate_union_table(mysql_conn):
|
|||||||
# 先处理whisper到stash的匹配(原left join逻辑)
|
# 先处理whisper到stash的匹配(原left join逻辑)
|
||||||
for w in whisper_data:
|
for w in whisper_data:
|
||||||
matched = False
|
matched = False
|
||||||
|
s = None
|
||||||
if w['release_date'] and w['studio_name']:
|
if w['release_date'] and w['studio_name']:
|
||||||
# 对name进行去空格,去特殊字符处理
|
# 对name进行去空格,去特殊字符处理
|
||||||
key_str = parse_union_key(w['release_date'], w['studio_name'], w['whisper_code'])
|
key_str = parse_union_key(w['release_date'], w['studio_name'], w['whisper_code'])
|
||||||
if key_str in dict_stash_data:
|
if key_str in dict_stash_data:
|
||||||
s = stash_data[dict_stash_data[key_str]]
|
set_idx = dict_stash_data[key_str]
|
||||||
result.append({
|
if len(set_idx) == 1:
|
||||||
'whisper_id': w['Id'],
|
idx = set_idx.pop()
|
||||||
'release_year': w['release_year'] or 0,
|
s = stash_data[idx]
|
||||||
'release_date': w['release_date'],
|
stash_data[idx]['matched'] = True
|
||||||
'whisper_code': w['whisper_code'],
|
matched = True
|
||||||
'title': w['title'],
|
else:
|
||||||
'studio_name': w['studio_name'],
|
# 多重匹配时,把code相同且未匹配过的优先匹配
|
||||||
'date': s['date'],
|
for idx in set_idx:
|
||||||
'code': s['code'],
|
if (stash_data[idx]['code'] and w['whisper_code']) and stash_data[idx]['code'].lower() == w['whisper_code'].lower() and not stash_data[idx].get('matched', False):
|
||||||
'stash_title': s['stash_title'],
|
s = stash_data[idx]
|
||||||
'name': s['name']
|
stash_data[idx]['matched'] = True
|
||||||
})
|
matched = True
|
||||||
dict_result_keys.add(key_str)
|
break
|
||||||
count_matched += 1
|
if not matched:
|
||||||
matched = True
|
# 否则匹配第一个未匹配过的
|
||||||
if not matched:
|
for idx in set_idx:
|
||||||
|
if not stash_data[idx].get('matched', False):
|
||||||
|
s = stash_data[idx]
|
||||||
|
stash_data[idx]['matched'] = True
|
||||||
|
matched = True
|
||||||
|
break
|
||||||
|
if matched and s:
|
||||||
|
result.append({
|
||||||
|
'whisper_id': w['Id'],
|
||||||
|
'release_year': w['release_year'] or 0,
|
||||||
|
'release_date': w['release_date'],
|
||||||
|
'whisper_code': w['whisper_code'],
|
||||||
|
'title': w['title'],
|
||||||
|
'studio_name': w['studio_name'],
|
||||||
|
'date': s['date'],
|
||||||
|
'code': s['code'],
|
||||||
|
'stash_title': s['stash_title'],
|
||||||
|
'name': s['name']
|
||||||
|
})
|
||||||
|
count_matched += 1
|
||||||
|
else:
|
||||||
result.append({
|
result.append({
|
||||||
'whisper_id': w['Id'],
|
'whisper_id': w['Id'],
|
||||||
'release_year': w['release_year'] or 0,
|
'release_year': w['release_year'] or 0,
|
||||||
@ -239,27 +264,22 @@ def generate_union_table(mysql_conn):
|
|||||||
|
|
||||||
# 再处理stash到whisper的反向匹配(新增双边匹配逻辑)
|
# 再处理stash到whisper的反向匹配(新增双边匹配逻辑)
|
||||||
for s in stash_data:
|
for s in stash_data:
|
||||||
matched = False
|
if s['matched']:
|
||||||
if s['date'] and s['name']:
|
continue # 已匹配过的跳过
|
||||||
# 对name进行去空格,去特殊字符处理
|
|
||||||
key_str = parse_union_key(s['date'], s['name'], s['code'])
|
result.append({
|
||||||
if key_str in dict_whisper_data:
|
'whisper_id': 0,
|
||||||
matched = True
|
'release_year': 0,
|
||||||
|
'release_date': None,
|
||||||
if not matched:
|
'whisper_code': None,
|
||||||
result.append({
|
'title': None,
|
||||||
'whisper_id': 0,
|
'studio_name': None,
|
||||||
'release_year': 0,
|
'date': s['date'],
|
||||||
'release_date': None,
|
'code': s['code'],
|
||||||
'whisper_code': None,
|
'stash_title': s['stash_title'],
|
||||||
'title': None,
|
'name': s['name']
|
||||||
'studio_name': None,
|
})
|
||||||
'date': s['date'],
|
count_stash_only += 1
|
||||||
'code': s['code'],
|
|
||||||
'stash_title': s['stash_title'],
|
|
||||||
'name': s['name']
|
|
||||||
})
|
|
||||||
count_stash_only += 1
|
|
||||||
|
|
||||||
print(f"匹配完成:匹配成功 {count_matched} 条,Whisper 独有 {count_whisper_only} 条,Stash 独有 {count_stash_only} 条")
|
print(f"匹配完成:匹配成功 {count_matched} 条,Whisper 独有 {count_whisper_only} 条,Stash 独有 {count_stash_only} 条")
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user