# moe_explore/olmoe_log_expert_vllm.py

# %%
import gc
from datetime import datetime
from pathlib import Path

import torch
from vllm import LLM, SamplingParams
from vllm.config import CompilationConfig, CompilationLevel
from vllm.distributed.parallel_state import destroy_model_parallel

from models.register import register_vllm_logit_logging_models
from utils import DataLogger as dlog

# %%
# dlog.get_instance(path=f"olmoe_{datetime.now().strftime("%Y%m%d-%H%M%S")}.parquet")

# %%
# Run a single chat completion against a local OLMoE checkpoint with vLLM,
# print the generated text, and record it through the project DataLogger.
# Engine teardown always runs in the `finally` clause, even on failure.
model_id = "./llms/OLMoE-1B-7B-0924-Instruct"

try:
    # Build the timestamp first: reusing double quotes inside an f-string
    # replacement field is only valid syntax on Python 3.12+.
    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    log_file = Path(f"olmoe_{timestamp}.parquet")
    # Start from a clean file if one with the same timestamp already exists.
    log_file.unlink(missing_ok=True)

    dlog.initialize(path=log_file)

    # Must be called before the engine is constructed below.
    register_vllm_logit_logging_models()

    llm = LLM(
        model=model_id,
        cpu_offload_gb=4,
        # tensor_parallel_size=2,
        gpu_memory_utilization=0.95,
        max_model_len=4096,
        # compilation_config=CompilationConfig(
        #     level=CompilationLevel.PIECEWISE,
        #     # By default, it goes up to max_num_seqs
        #     cudagraph_capture_sizes=[1, 2, 4, 8, 16],
        # ),
        enforce_eager=True,
    )

    sampling_params = SamplingParams(
        temperature=0.6,
        top_p=0.95,
        top_k=20,
        max_tokens=1024,
    )

    # Prepare the input to the model.
    prompt = "Give me a very short introduction to large language models."
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]
    # Disabled alternative: the same kind of conversation written in Chinese
    # (system: "You are an AI assistant."; user: "Please briefly explain
    # what a large language model is.").

    # Generate outputs.
    outputs = llm.chat(
        messages,
        sampling_params=sampling_params,
        # chat_template_kwargs={"enable_thinking": True},  # Set to False to strictly disable thinking
    )

    # Print and log the first completion choice of every request output.
    for out in outputs:
        # out.prompt is the input prompt; out.outputs is a list of completion choices
        text = out.outputs[0].text
        print(text)
        print("\n---\n")
        dlog.log({
            "_time": datetime.now(),
            "output_text": text,
        })

    print("Finish completion")

except Exception:
    # Top-level boundary of the script: report the full traceback (not just
    # the message) and fall through to cleanup without re-raising.
    import traceback

    traceback.print_exc()

finally:
    # Drop the module-level reference to the engine, if it was ever created,
    # so the model objects become collectable. `pop` avoids leaving a stray
    # `llm = None` binding behind when construction failed.
    globals().pop("llm", None)
    destroy_model_parallel()
    # Collect unreachable objects first so the memory they held is back in
    # the CUDA caching allocator before the cache itself is released.
    gc.collect()
    torch.cuda.empty_cache()