modify scripts

This commit is contained in:
2025-07-17 11:46:55 +08:00
parent 77bb81b477
commit 9b28efd2c9
3 changed files with 163 additions and 0 deletions

3
.gitignore vendored
View File

@ -3,6 +3,9 @@ log/
logs/ logs/
*.log *.log
# 忽略数据文件
tools/data/*
# 忽略编译后的二进制文件 # 忽略编译后的二进制文件
bin/* bin/*
obj/ obj/

View File

@ -3,6 +3,9 @@ log/
logs/ logs/
*.log *.log
# 忽略数据文件
tools/data/*
# 忽略编译后的二进制文件 # 忽略编译后的二进制文件
bin/* bin/*
obj/ obj/

157
tools/puzzle.py Normal file
View File

@ -0,0 +1,157 @@
import os
import json
import random
import time
import logging
from pathlib import Path
from openai import AzureOpenAI
# --- Azure OpenAI configuration -------------------------------------------
# SECURITY: the subscription key must never be hard-coded in source control.
# The previously committed literal key should be considered leaked and
# rotated; the key is now read from the environment.
endpoint = "https://grammar.openai.azure.com/"
model_name = "gpt-4o"
deployment = "gpt4"
subscription_key = os.environ.get("AZURE_OPENAI_API_KEY", "")
api_version = "2024-12-01-preview"

client = AzureOpenAI(
    api_version=api_version,
    azure_endpoint=endpoint,
    api_key=subscription_key,
)

# Log INFO and above to stdout with a timestamp prefix.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s: %(message)s",
    handlers=[logging.StreamHandler()]
)

# Data layout: EOWL word lists are read from WORDS_DIR, per-batch results are
# written to RESULT_DIR, and the de-duplicated word list is staged in TEMP_FILE.
BASE_DIR = './data'
WORDS_DIR = f"{BASE_DIR}/EOWL-v1.1.2/LF Delimited Format"
RESULT_DIR = f"{BASE_DIR}/result"
os.makedirs(RESULT_DIR, exist_ok=True)
TEMP_FILE = f"{BASE_DIR}/temp_words.txt"

# Number of words sent to the model per request.
batch_words_size = 100
def find_words_files(folder):
    """Return every *.txt file in *folder* whose filename contains "Words"."""
    return [path for path in Path(folder).glob("*.txt") if "Words" in path.name]
def collect_words(files):
    """Read all *files* and return their unique words of length >= 3.

    Lines are stripped of surrounding whitespace; duplicates within and
    across files are collapsed.  Returns a list in arbitrary (set) order.
    """
    unique = set()
    for path in files:
        with open(path, 'r', encoding='utf-8') as handle:
            stripped = (line.strip() for line in handle)
            unique.update(word for word in stripped if len(word) >= 3)
    return list(unique)
def write_temp(words):
    """Persist *words* to TEMP_FILE, one word per line, UTF-8 encoded."""
    with open(TEMP_FILE, 'w', encoding='utf-8') as handle:
        handle.writelines(f"{word}\n" for word in words)
def read_batches(batch_size=batch_words_size):
    """Yield successive lists of at most *batch_size* words from TEMP_FILE.

    Blank lines are skipped.  The whole file is read into memory before the
    first batch is yielded.
    """
    words = []
    with open(TEMP_FILE, 'r', encoding='utf-8') as handle:
        for line in handle:
            word = line.strip()
            if word:
                words.append(word)
    for start in range(0, len(words), batch_size):
        yield words[start:start + batch_size]
def build_prompt(words):
    """Build the analysis prompt for one batch of *words*.

    The model is instructed to answer with a pure-JSON array whose objects
    carry the keys: word, theme, part_of_speech, frequency, same_root_group.
    (The caller still strips a ```json fence defensively.)
    """
    word_list = ", ".join(words)
    # Typo fixed: "Please response" -> "Please respond".  A dead module-level
    # bare-string statement that duplicated this instruction was removed.
    prompt = f"""
Please analyze the following list of English words and do the following:
1. Classify each word into a theme (like Animals, Plants, Materials, Body Parts, Clothes & Accessories, Food & Drinks, Places, Transportation, Sports, Colors, Numbers, Emotions, Tools, People & Occupations, etc.).
2. Identify the part of speech of each word (verb, noun, adjective, etc.).
3. Mark the frequency of usage of each word in everyday English as High, Medium, or Low.
4. Identify words with the same word root and group them.
Please respond with pure JSON only, without any formatting or explanations.
Each object should have the keys: word, theme, part_of_speech, frequency, same_root_group.
Here are the words:
{word_list}
"""
    return prompt
def call_openai_with_retry(prompt, retries=3, delay=5):
    """Send *prompt* to the chat deployment, retrying on any failure.

    Returns the response text (with any markdown ```json fence stripped),
    or None once all *retries* attempts have failed.  Sleeps *delay*
    seconds between attempts, but not after the final one.
    """
    for attempt in range(retries):
        try:
            response = client.chat.completions.create(
                messages=[
                    {"role": "system", "content": "You are an expert English linguist and lexicographer."},
                    {"role": "user", "content": prompt}
                ],
                max_tokens=16000,
                temperature=0.7,
                top_p=1.0,
                model=deployment
            )
            text = response.choices[0].message.content.strip()
            # Defensively strip a ```json fence the model may add despite
            # the instructions.  The old slice text[7:-3] silently chopped
            # three real characters whenever the reply started with the
            # fence but did not end with one.
            if text.startswith("```json"):
                text = text[7:]
                if text.rstrip().endswith("```"):
                    text = text.rstrip()[:-3]
                text = text.strip()
            return text
        except Exception as e:
            logging.warning(f"OpenAI request failed (attempt {attempt+1}): {e}")
            if attempt + 1 < retries:
                time.sleep(delay)
    logging.error("OpenAI request failed after all retries.")
    return None
def save_result(index, req, resp, is_json):
    """Write one batch's request/response pair to RESULT_DIR as a JSON file.

    The filename encodes the zero-padded batch *index*, whether the response
    item count matched the request count, and whether the response parsed
    as JSON (``json``) or is raw text (``txt``).
    """
    # A batch "matches" only when the response is JSON and has exactly one
    # entry per requested word.
    matched = bool(is_json and len(req) == len(resp))
    flag = "json" if is_json else "txt"
    match_str = "matched" if matched else 'notmatch'
    filename = f"{RESULT_DIR}/{str(index).zfill(5)}_{match_str}_{flag}.json"
    data = {
        'req_len': len(req),
        'rsp_len': len(resp) if is_json else 0,
        'match': matched,
        'req': req,
        'rsp': resp
    }
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
    # Bug fix: previously logged the literal placeholder "(unknown)" instead
    # of the destination path.
    logging.info(f"Saved result to {filename}")
def process_folder(folder):
    """Run the full pipeline on *folder*: gather words, batch them, and
    save each batch's model annotation to disk."""
    word_files = find_words_files(folder)
    logging.info(f"Found {len(word_files)} files to process.")
    unique_words = collect_words(word_files)
    logging.info(f"Collected {len(unique_words)} unique words.")
    write_temp(unique_words)
    for idx, batch in enumerate(read_batches(), 1):
        logging.info(f"Processing batch {idx} with {len(batch)} words")
        resp_text = call_openai_with_retry(build_prompt(batch))
        if resp_text is None:
            # All retries failed: record the failure and move on.
            save_result(idx, batch, "Failed to get response", False)
            continue
        try:
            parsed = json.loads(resp_text)
        except json.JSONDecodeError:
            logging.warning(f"Batch {idx} response is not valid JSON.")
            save_result(idx, batch, resp_text, False)
        else:
            save_result(idx, batch, parsed, True)
        time.sleep(2)  # pause between batches
if __name__ == "__main__":
    # Script entry point: annotate the EOWL word lists found under WORDS_DIR.
    process_folder(WORDS_DIR)