modify scripts
This commit is contained in:
3
.gitignore
vendored
3
.gitignore
vendored
@ -3,6 +3,9 @@ log/
|
|||||||
logs/
|
logs/
|
||||||
*.log
|
*.log
|
||||||
|
|
||||||
|
# 忽略数据文件
|
||||||
|
tools/data/*
|
||||||
|
|
||||||
# 忽略编译后的二进制文件
|
# 忽略编译后的二进制文件
|
||||||
bin/*
|
bin/*
|
||||||
obj/
|
obj/
|
||||||
|
|||||||
@ -3,6 +3,9 @@ log/
|
|||||||
logs/
|
logs/
|
||||||
*.log
|
*.log
|
||||||
|
|
||||||
|
# 忽略数据文件
|
||||||
|
tools/data/*
|
||||||
|
|
||||||
# 忽略编译后的二进制文件
|
# 忽略编译后的二进制文件
|
||||||
bin/*
|
bin/*
|
||||||
obj/
|
obj/
|
||||||
|
|||||||
157
tools/puzzle.py
Normal file
157
tools/puzzle.py
Normal file
@ -0,0 +1,157 @@
|
|||||||
|
import os
import json
import random
import time
import logging
from pathlib import Path

from openai import AzureOpenAI

# Azure OpenAI connection settings.
endpoint = "https://grammar.openai.azure.com/"
model_name = "gpt-4o"
deployment = "gpt4"

# SECURITY: an API key was hard-coded here and is now leaked in VCS
# history — it must be rotated. Prefer the AZURE_OPENAI_API_KEY
# environment variable; the old literal is kept only as a fallback so
# existing deployments keep working until the key is rotated.
subscription_key = os.environ.get(
    "AZURE_OPENAI_API_KEY", "8b68c235b737488ab9a99983a14f8cca"
)
api_version = "2024-12-01-preview"

client = AzureOpenAI(
    api_version=api_version,
    azure_endpoint=endpoint,
    api_key=subscription_key,
)

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s: %(message)s",
    handlers=[logging.StreamHandler()]
)

# Data layout: source word lists live under WORDS_DIR; one JSON result
# file per batch is written to RESULT_DIR; TEMP_FILE holds the
# de-duplicated word dump between the collect and batch phases.
BASE_DIR = './data'
WORDS_DIR = f"{BASE_DIR}/EOWL-v1.1.2/LF Delimited Format"
RESULT_DIR = f"{BASE_DIR}/result"
os.makedirs(RESULT_DIR, exist_ok=True)
TEMP_FILE = f"{BASE_DIR}/temp_words.txt"

# Number of words sent to the model per request.
batch_words_size = 100
def find_words_files(folder):
    """Return the .txt files in *folder* whose filename contains "Words"."""
    return [path for path in Path(folder).glob("*.txt") if "Words" in path.name]
def collect_words(files):
    """Read every file in *files* and return its unique words.

    A "word" is a stripped line of length >= 3; shorter lines are
    skipped. The order of the returned list is unspecified (set-backed).
    """
    unique = set()
    for path in files:
        with open(path, 'r', encoding='utf-8') as fh:
            stripped = (line.strip() for line in fh)
            unique.update(w for w in stripped if len(w) >= 3)
    return list(unique)
def write_temp(words):
    """Persist *words* to TEMP_FILE, one word per line (UTF-8)."""
    with open(TEMP_FILE, 'w', encoding='utf-8') as out:
        out.writelines(w + '\n' for w in words)
def read_batches(batch_size=batch_words_size):
    """Yield successive lists of at most *batch_size* words from TEMP_FILE.

    Blank lines in the temp file are skipped.
    """
    with open(TEMP_FILE, 'r', encoding='utf-8') as fh:
        words = [w for w in (line.strip() for line in fh) if w]
    for start in range(0, len(words), batch_size):
        yield words[start:start + batch_size]
def build_prompt(words):
    """Build the linguistic-analysis prompt for one batch of words.

    The prompt asks the model to classify each word (theme, part of
    speech, everyday-usage frequency, shared-root grouping) and to
    answer with a pure JSON array — one object per word with the keys
    word, theme, part_of_speech, frequency, same_root_group.
    """
    # NOTE: the original file had a stray no-op module-level string
    # literal ('''Please respond with pure JSON only...''') before this
    # def — dead code, removed.
    word_list = ", ".join(words)
    prompt = f"""
Please analyze the following list of English words and do the following:

1. Classify each word into a theme (like Animals, Plants, Materials, Body Parts, Clothes & Accessories, Food & Drinks, Places, Transportation, Sports, Colors, Numbers, Emotions, Tools, People & Occupations, etc.).
2. Identify the part of speech of each word (verb, noun, adjective, etc.).
3. Mark the frequency of usage of each word in everyday English as High, Medium, or Low.
4. Identify words with the same word root and group them.

Please respond with pure JSON only, without any formatting or explanations.
Each object should have the keys: word, theme, part_of_speech, frequency, same_root_group.

Here are the words:
{word_list}
"""
    return prompt
def call_openai_with_retry(prompt, retries=3, delay=5):
    """Send *prompt* to the chat deployment, retrying on any failure.

    prompt:  user message text built by build_prompt().
    retries: number of attempts before giving up.
    delay:   seconds slept between attempts.

    Returns the response text with a leading Markdown ```json fence
    stripped, or None when every attempt failed.
    """
    for attempt in range(retries):
        try:
            response = client.chat.completions.create(
                messages=[
                    {"role": "system", "content": "You are an expert English linguist and lexicographer."},
                    {"role": "user", "content": prompt}
                ],
                max_tokens=16000,
                temperature=0.7,
                top_p=1.0,
                model=deployment
            )
            text = response.choices[0].message.content.strip()
            # Strip a Markdown ```json fence if present. The original
            # sliced text[7:-3] unconditionally, which silently eats the
            # last 3 characters when the closing ``` is missing — only
            # drop the trailing fence when it actually exists.
            if text.startswith("```json"):
                text = text[7:]
                if text.endswith("```"):
                    text = text[:-3]
                text = text.strip()
            return text
        except Exception as e:
            # Broad catch is deliberate: any transport/API error is
            # retried; the final failure is reported below.
            logging.warning(f"OpenAI request failed (attempt {attempt+1}): {e}")
            time.sleep(delay)
    logging.error("OpenAI request failed after all retries.")
    return None
def save_result(index, req, resp, is_json):
    """Write one batch's request/response pair to RESULT_DIR as JSON.

    index:   1-based batch number, zero-padded into the file name.
    req:     list of words that were sent in the prompt.
    resp:    parsed JSON (is_json=True) or raw/error text (is_json=False).
    is_json: whether *resp* is already-parsed JSON.
    """
    # A batch "matched" only when the model returned valid JSON with
    # exactly one entry per requested word.
    matched = bool(is_json and len(req) == len(resp))
    flag = "json" if is_json else "txt"
    match_str = "matched" if matched else 'notmatch'
    filename = f"{RESULT_DIR}/{str(index).zfill(5)}_{match_str}_{flag}.json"
    data = {
        'req_len': len(req),
        'rsp_len': len(resp) if is_json else 0,
        'match': matched,
        'req': req,
        'rsp': resp
    }
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
    # Bug fix: the log line did not interpolate the path it claimed to
    # report — include the actual file name.
    logging.info(f"Saved result to {filename}")
def process_folder(folder):
    """Run the full pipeline over *folder*.

    Gathers the word files, de-duplicates their words into TEMP_FILE,
    then classifies the words batch by batch via the OpenAI client,
    saving each batch's outcome through save_result().
    """
    files = find_words_files(folder)
    logging.info(f"Found {len(files)} files to process.")
    words = collect_words(files)
    logging.info(f"Collected {len(words)} unique words.")
    write_temp(words)

    for idx, batch in enumerate(read_batches(), 1):
        logging.info(f"Processing batch {idx} with {len(batch)} words")
        resp_text = call_openai_with_retry(build_prompt(batch))

        # No response after all retries: record the failure and move on.
        if resp_text is None:
            save_result(idx, batch, "Failed to get response", False)
            continue

        try:
            resp_json = json.loads(resp_text)
            save_result(idx, batch, resp_json, True)
        except json.JSONDecodeError:
            logging.warning(f"Batch {idx} response is not valid JSON.")
            save_result(idx, batch, resp_text, False)

        time.sleep(2)  # pause between batches
# Script entry point: process the bundled EOWL word-list directory.
if __name__ == "__main__":
    process_folder(WORDS_DIR)
Reference in New Issue
Block a user