Skip to content

Instantly share code, notes, and snippets.

@hxr521741
Created June 16, 2025 02:12
Show Gist options
  • Save hxr521741/d9887c279cb2be2714ed23f21c1594f5 to your computer and use it in GitHub Desktop.
Save hxr521741/d9887c279cb2be2714ed23f21c1594f5 to your computer and use it in GitHub Desktop.
import json
import random
def generate_instructions_from_manual(n=2000):
    """Generate synthetic operations instructions with intent and entity labels.

    Each of the ``n`` iterations produces one single-turn sample built from a
    random operation and a random target (resource, service or file); monitor
    operations ("查看", "查询", "检查") additionally get a random time span.
    After roughly 10% of the single-turn samples a follow-up (multi-turn)
    sample is appended whose ``context`` holds the preceding instruction.

    The global ``random`` generator is re-seeded with 42 on every call, so the
    output is deterministic for a given ``n``.

    Args:
        n: Number of single-turn samples to generate.

    Returns:
        A list of dicts with the keys ``text``, ``intent``, ``entities`` and
        ``context``. The list holds ``n`` single-turn samples plus the
        interleaved follow-up samples.
    """
    operations = ["查看", "查询", "检查", "重启", "配置", "创建", "删除", "安装"]
    resources = ["CPU", "内存", "磁盘", "用户"]
    services = ["auditd", "systemd", "ntp", "apache", "openldap", "samba"]
    times = ["五分钟", "一小时", "实时"]
    files = ["/etc/audit/auditd.conf", "/var/log/audit/audit.log"]
    intents_map = {
        "查看": "monitor", "查询": "monitor", "检查": "monitor",
        "重启": "restart", "配置": "configure", "创建": "manage", "删除": "manage", "安装": "install"
    }
    monitor_ops = ["查看", "查询", "检查"]

    # Loop-invariant: the pool of targets and the entity label of each one.
    targets = resources + services + files
    target_labels = {name: "FILE" for name in files}
    target_labels.update({name: "SERVICE" for name in services})
    target_labels.update({name: "RESOURCE" for name in resources})

    # Follow-up utterances with explicit intent and entity spans. None of the
    # texts contains a space, so the entities cannot be recovered by splitting
    # on " " (indexing the second piece raises IndexError).
    follow_ups = [
        {
            "text": "哪些进程占用最高?",
            "intent": "list_top_processes",
            "entities": [{"label": "RESOURCE", "text": "进程"}],
        },
        {
            "text": "查看日志",
            "intent": "query_log",
            "entities": [
                {"label": "OPERATION", "text": "查看"},
                {"label": "RESOURCE", "text": "日志"},
            ],
        },
        {
            "text": "诊断问题",
            "intent": "diagnose_issue",
            "entities": [
                {"label": "OPERATION", "text": "诊断"},
                {"label": "RESOURCE", "text": "问题"},
            ],
        },
    ]

    data = []
    random.seed(42)
    for _ in range(n):
        op = random.choice(operations)
        res = random.choice(targets)
        # Only monitor-style operations carry a time span.
        time_span = random.choice(times) if op in monitor_ops else ""
        intent_base = intents_map.get(op, "other")
        if intent_base != "other":
            intent = f"{intent_base}_{res.lower().replace('/', '_')}"
        else:
            intent = "other"
        # strip() removes the trailing space left by an empty time span.
        instruction = f"{op} {res} {time_span}".strip()

        entities = [
            {"label": "OPERATION", "text": op},
            {"label": target_labels[res], "text": res},
        ]
        if time_span:
            entities.append({"label": "TIME", "text": time_span})

        data.append({
            "text": instruction,
            "intent": intent,
            "entities": entities,
            "context": [],
        })

        if random.random() < 0.1:
            follow_up = random.choice(follow_ups)
            data.append({
                "text": follow_up["text"],
                "intent": follow_up["intent"],
                # Copy so samples never share mutable entity dicts.
                "entities": [dict(entity) for entity in follow_up["entities"]],
                "context": [{"user": instruction, "response": f"{instruction}已执行"}],
            })
    return data
def save_to_jsonl(data, file_path="KylinSmartOps_2000_Instructions.jsonl"):
    """Write every item of ``data`` to ``file_path`` as one JSON object per line.

    The file is opened in write mode (an existing file is overwritten) with
    UTF-8 encoding, and non-ASCII characters are written verbatim rather than
    as ``\\uXXXX`` escapes.

    Args:
        data: Iterable of JSON-serializable items.
        file_path: Destination path of the JSONL file.
    """
    with open(file_path, 'w', encoding='utf-8') as f:
        f.writelines(json.dumps(item, ensure_ascii=False) + '\n' for item in data)
# Script entry: build the 2000-sample dataset and dump it to the default JSONL path.
data = generate_instructions_from_manual(n=2000)
save_to_jsonl(data=data)
@hxr521741
Copy link
Author

import json
import random

def generate_instructions_from_manual(n=2000):
    """Generate synthetic operations instructions with intent and entity labels.

    Each of the ``n`` iterations produces one single-turn sample built from a
    random operation and a random target (resource, service or file); monitor
    operations ("查看", "查询", "检查") additionally get a random time span.
    After roughly 10% of the single-turn samples a follow-up (multi-turn)
    sample is appended whose ``context`` holds the preceding instruction.

    The global ``random`` generator is re-seeded with 42 on every call, so the
    output is deterministic for a given ``n``.

    Args:
        n: Number of single-turn samples to generate.

    Returns:
        A list of dicts with the keys ``text``, ``intent``, ``entities`` and
        ``context``. The list holds ``n`` single-turn samples plus the
        interleaved follow-up samples.
    """
    operations = ["查看", "查询", "检查", "重启", "配置", "创建", "删除", "安装"]
    resources = ["CPU", "内存", "磁盘", "用户"]
    services = ["auditd", "systemd", "ntp", "apache", "openldap", "samba"]
    times = ["五分钟", "一小时", "实时"]
    files = ["/etc/audit/auditd.conf", "/var/log/audit/audit.log"]
    intents_map = {
        "查看": "monitor", "查询": "monitor", "检查": "monitor",
        "重启": "restart", "配置": "configure", "创建": "manage", "删除": "manage", "安装": "install"
    }
    monitor_ops = ["查看", "查询", "检查"]

    # Loop-invariant: the pool of targets and the entity label of each one.
    targets = resources + services + files
    target_labels = {name: "FILE" for name in files}
    target_labels.update({name: "SERVICE" for name in services})
    target_labels.update({name: "RESOURCE" for name in resources})

    # Follow-up utterances with explicit intent and entity spans. None of the
    # texts contains a space, so the entities cannot be recovered by splitting
    # on " " (indexing the second piece raises IndexError).
    follow_ups = [
        {
            "text": "哪些进程占用最高?",
            "intent": "list_top_processes",
            "entities": [{"label": "RESOURCE", "text": "进程"}],
        },
        {
            "text": "查看日志",
            "intent": "query_log",
            "entities": [
                {"label": "OPERATION", "text": "查看"},
                {"label": "RESOURCE", "text": "日志"},
            ],
        },
        {
            "text": "诊断问题",
            "intent": "diagnose_issue",
            "entities": [
                {"label": "OPERATION", "text": "诊断"},
                {"label": "RESOURCE", "text": "问题"},
            ],
        },
    ]

    data = []
    random.seed(42)
    for _ in range(n):
        op = random.choice(operations)
        res = random.choice(targets)
        # Only monitor-style operations carry a time span.
        time_span = random.choice(times) if op in monitor_ops else ""
        intent_base = intents_map.get(op, "other")
        if intent_base != "other":
            intent = f"{intent_base}_{res.lower().replace('/', '_')}"
        else:
            intent = "other"
        # strip() removes the trailing space left by an empty time span.
        instruction = f"{op} {res} {time_span}".strip()

        entities = [
            {"label": "OPERATION", "text": op},
            {"label": target_labels[res], "text": res},
        ]
        if time_span:
            entities.append({"label": "TIME", "text": time_span})

        data.append({
            "text": instruction,
            "intent": intent,
            "entities": entities,
            "context": [],
        })

        if random.random() < 0.1:
            follow_up = random.choice(follow_ups)
            data.append({
                "text": follow_up["text"],
                "intent": follow_up["intent"],
                # Copy so samples never share mutable entity dicts.
                "entities": [dict(entity) for entity in follow_up["entities"]],
                "context": [{"user": instruction, "response": f"{instruction}已执行"}],
            })
    return data

def save_to_jsonl(data, file_path="KylinSmartOps_2000_Instructions.jsonl"):
    """Write every item of ``data`` to ``file_path`` as one JSON object per line.

    The file is opened in write mode (an existing file is overwritten) with
    UTF-8 encoding, and non-ASCII characters are written verbatim rather than
    as ``\\uXXXX`` escapes.

    Args:
        data: Iterable of JSON-serializable items.
        file_path: Destination path of the JSONL file.
    """
    with open(file_path, 'w', encoding='utf-8') as f:
        f.writelines(json.dumps(item, ensure_ascii=False) + '\n' for item in data)

# Script entry: build the 2000-sample dataset and dump it to the default JSONL path.
data = generate_instructions_from_manual(n=2000)
save_to_jsonl(data=data)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment