清华开源ChatGPT自动编程ChatDev项目statistics.py代码解读

这段代码是一个名为“get_info”的函数，它的作用是读取指定目录下的文件和日志信息，并计算出项目的一些基本信息，最终将这些信息以字符串的形式返回。

具体而言，这个函数接受两个参数：一个是代表目录的字符串“dir”，另一个是代表日志文件路径的字符串“log_filepath”。它首先定义了一些变量，并初始化为-1，这些变量存储的是项目信息，如代码行数、版本更新次数等等。

接下来，函数会检查目录是否存在，如果存在，则获取该目录下的所有文件名；然后统计出其中Python代码文件、PNG图像文件、其他文档文件的数量，并分别赋给相应的变量。当代码文件和PNG图像文件数量不为-1时，还会根据数量计算出费用（每个PNG文件0.016美元，每1000个提示和完成令牌各0.003美元和0.004美元，分别累计），并将计算结果加入费用“cost”的变量中。

同时，该函数还会检查目录下是否存在“meta.txt”、“requirements.txt”、“manual.md”等文件，并分别统计它们的行数，以及计算出代码文件中的代码行数，稍后都会分别保存到对应的变量中。

该函数还会读取日志文件的内容，查找指定的字符串（如“[Start Chat]”和“<->”）来获取与聊天相关的信息（如对话数量“num_utterance”和自我反思数量“num_reflection”），并累加到对应的变量中。此外，该函数还会查找日志中与GPT-3.5模型输入和输出有关的字符串，从中提取提示和完成令牌的数目、总的令牌数等信息，并分别保存到对应变量中。

最后，该函数根据从文件和日志中获取的所有信息，生成一个包含各种统计指标的字符串“info”，并将其返回。该字符串包括项目的版本、代码文件数、PNG图像文件数、其他文档文件数、代码行数、环境配置行数、手动文档行数、对话数量、自我反思数量、提示令牌数、完成令牌数、总令牌数等多个信息，每个信息都以相应的标签和数值的形式呈现。



def get_info(dir, log_filepath):
    print("dir:", dir)

    version_updates = -1
    num_code_files = -1
    num_png_files = -1
    num_doc_files = -1
    code_lines = -1
    env_lines = -1
    manual_lines = -1
    duration = -1
    num_utterance = -1
    num_reflection = -1
    num_prompt_tokens = -1
    num_completion_tokens = -1
    num_total_tokens = -1

    if os.path.exists(dir):
        filenames = os.listdir(dir)
        # print(filenames)

        num_code_files = len([filename for filename in filenames if filename.endswith(".py")])
        # print("num_code_files:", num_code_files)

        num_png_files = len([filename for filename in filenames if filename.endswith(".png")])
        # print("num_png_files:", num_png_files)

        num_doc_files = 0
        for filename in filenames:
            if filename.endswith(".py") or filename.endswith(".png"):
                continue
            if os.path.isfile(os.path.join(dir, filename)):
                # print(filename)
                num_doc_files += 1
        # print("num_doc_files:", num_doc_files)

        if "meta.txt" in filenames:
            lines = open(os.path.join(dir, "meta.txt"), "r", encoding="utf8").read().split("\n")
            version_updates = float([lines[i + 1] for i, line in enumerate(lines) if "Code_Version" in line][0]) + 1
        else:
            version_updates = -1
        # print("version_updates: ", version_updates)

        if "requirements.txt" in filenames:
            lines = open(os.path.join(dir, "requirements.txt"), "r", encoding="utf8").read().split("\n")
            env_lines = len([line for line in lines if len(line.strip()) > 0])
        else:
            env_lines = -1
        # print("env_lines:", env_lines)

        if "manual.md" in filenames:
            lines = open(os.path.join(dir, "manual.md"), "r", encoding="utf8").read().split("\n")
            manual_lines = len([line for line in lines if len(line.strip()) > 0])
        else:
            manual_lines = -1
        # print("manual_lines:", manual_lines)

        code_lines = 0
        for filename in filenames:
            if filename.endswith(".py"):
                # print("......filename:", filename)
                lines = open(os.path.join(dir, filename), "r", encoding="utf8").read().split("\n")
                code_lines += len([line for line in lines if len(line.strip()) > 0])
        # print("code_lines:", code_lines)

        lines = open(log_filepath, "r", encoding="utf8").read().split("\n")
        start_lines = [line for line in lines if "**[Start Chat]**" in line]
        chat_lines = [line for line in lines if "<->" in line]
        num_utterance = len(start_lines) + len(chat_lines)
        # print("num_utterance:", num_utterance)

        lines = open(log_filepath, "r", encoding="utf8").read().split("\n")
        sublines = [line for line in lines if line.startswith("prompt_tokens:")]
        if len(sublines) > 0:
            nums = [int(line.split(": ")[-1]) for line in sublines]
            num_prompt_tokens = np.sum(nums)
            # print("num_prompt_tokens:", num_prompt_tokens)

        lines = open(log_filepath, "r", encoding="utf8").read().split("\n")
        sublines = [line for line in lines if line.startswith("completion_tokens:")]
        if len(sublines) > 0:
            nums = [int(line.split(": ")[-1]) for line in sublines]
            num_completion_tokens = np.sum(nums)
            # print("num_completion_tokens:", num_completion_tokens)

        lines = open(log_filepath, "r", encoding="utf8").read().split("\n")
        sublines = [line for line in lines if line.startswith("total_tokens:")]
        if len(sublines) > 0:
            nums = [int(line.split(": ")[-1]) for line in sublines]
            num_total_tokens = np.sum(nums)
            # print("num_total_tokens:", num_total_tokens)

        lines = open(log_filepath, "r", encoding="utf8").read().split("\n")

        lines = open(log_filepath, "r", encoding="utf8").read().split("\n")
        num_reflection = 0
        for line in lines:
            if "on : Reflection" in line:
                num_reflection += 1
        # print("num_reflection:", num_reflection)

    cost = 0.0
    if num_png_files != -1:
        cost += num_png_files * 0.016
    if num_prompt_tokens != -1:
        cost += num_prompt_tokens * 0.003 / 1000.0
    if num_completion_tokens != -1:
        cost += num_completion_tokens * 0.004 / 1000.0

    # info = f"🕑duration={duration}s 💰cost=${cost} 🔨version_updates={version_updates} 📃num_code_files={num_code_files} 🏞num_png_files={num_png_files} 📚num_doc_files={num_doc_files} 📃code_lines={code_lines} 📋env_lines={env_lines} 📒manual_lines={manual_lines} 🗣num_utterances={num_utterance} 🤔num_self_reflections={num_reflection} ❓num_prompt_tokens={num_prompt_tokens} ❗num_completion_tokens={num_completion_tokens} ⁉️num_total_tokens={num_total_tokens}"

    info = "\n\n💰**cost**=${:.6f}\n\n🔨**version_updates**={}\n\n📃**num_code_files**={}\n\n🏞**num_png_files**={}\n\n📚**num_doc_files**={}\n\n📃**code_lines**={}\n\n📋**env_lines**={}\n\n📒**manual_lines**={}\n\n🗣**num_utterances**={}\n\n🤔**num_self_reflections**={}\n\n❓**num_prompt_tokens**={}\n\n❗**num_completion_tokens**={}\n\n🌟**num_total_tokens**={}" \
        .format(cost,
                version_updates,
                num_code_files,
                num_png_files,
                num_doc_files,
                code_lines,
                env_lines,
                manual_lines,
                num_utterance,
                num_reflection,
                num_prompt_tokens,
                num_completion_tokens,
                num_total_tokens)

    return info