Skip to content

Instantly share code, notes, and snippets.

@mrigankpawagi
Last active July 10, 2025 11:01
Show Gist options
  • Select an option

  • Save mrigankpawagi/6fded73b14c3ac88430fb8415c51661a to your computer and use it in GitHub Desktop.

Select an option

Save mrigankpawagi/6fded73b14c3ac88430fb8415c51661a to your computer and use it in GitHub Desktop.
Script to extract information about GitHub's CodeQL queries. Clone github.com/github/codeql and run the script from the root of the repository. It outputs a JSON file called ql_metadata.json and a CSV file called ql_metadata.csv. You need to have the CodeQL CLI installed on your system.
import os
import subprocess
import json
import csv
from pathlib import Path
def get_ql_and_qls_files(root_dir):
ql_files = []
qls_files = []
for dirpath, _, filenames in os.walk(root_dir):
for filename in filenames:
rel_path = os.path.relpath(os.path.join(dirpath, filename), root_dir)
# Only include files in <something>/ql/src/
# This means rel_path should match pattern */ql/src/*.ql or *.qls
# We'll check if 'ql/src/' is in the rel_path and it's not at the start
if os.sep + 'ql' + os.sep + 'src' + os.sep in rel_path:
if filename.endswith('.ql'):
ql_files.append(rel_path)
elif filename.endswith('.qls'):
qls_files.append(rel_path)
return ql_files, qls_files
if __name__ == "__main__":
root = os.path.dirname(os.path.abspath(__file__))
ql_files, qls_files = get_ql_and_qls_files(root)
print("Found", len(ql_files), ".ql files")
print("Found", len(qls_files), ".qls files")
metadata_results = {}
PRIMARY_KEY = "id"
KEYS_TO_SPLIT = "tags", "alternate-ids", "metricAggregate"
RENAME_KEYS = {
"tags:": "tags",
}
KEY_DEFAULT_VALUE_IF_EMPTY = {
"deprecated": "Deprecated",
}
# First, process all .ql files for metadata
for rel_path in ql_files:
try:
result = subprocess.run([
'codeql', 'resolve', 'metadata', rel_path, '--format=json'
], capture_output=True, text=True, check=True)
metadata = json.loads(result.stdout)
for key in RENAME_KEYS:
if key in metadata:
new_key = RENAME_KEYS[key]
metadata[new_key] = metadata.pop(key)
for key in KEYS_TO_SPLIT:
if key in metadata and isinstance(metadata[key], str):
metadata[key] = [item.strip() for item in metadata[key].split() if item.strip()]
for key, default_value in KEY_DEFAULT_VALUE_IF_EMPTY.items():
if key in metadata and not metadata[key]:
metadata[key] = default_value
metadata_results[rel_path] = metadata
except subprocess.CalledProcessError as e:
print(f"Error processing {rel_path}: {e}")
# Now, for each .qls file, resolve queries and update metadata with packs
for qls_rel_path in qls_files:
try:
result = subprocess.run([
'codeql', 'resolve', 'queries', qls_rel_path, '--format=json'
], capture_output=True, text=True, check=True)
queries = json.loads(result.stdout)
qls_stem = os.path.splitext(os.path.basename(qls_rel_path))[0]
for query_path in queries:
# construct relative path from the absolute query_path
query_rel_path = os.path.relpath(query_path, root)
if query_rel_path in metadata_results:
meta = metadata_results[query_rel_path]
if 'packs' not in meta or not isinstance(meta['packs'], list):
meta['packs'] = []
if qls_stem not in meta['packs']:
meta['packs'].append(qls_stem)
except subprocess.CalledProcessError as e:
print(f"Error resolving queries for {qls_rel_path}: {e}")
# Save to a JSON file
with open('ql_metadata.json', 'w', encoding='utf-8') as f:
json.dump(metadata_results, f, indent=2)
# Save to a CSV file
rows = []
for path, metadata in metadata_results.items():
metadata["path"] = Path(path).as_posix()
rows.append(metadata)
# Collect all possible keys for CSV header
all_keys_set = set()
for row in rows:
all_keys_set.update(row.keys())
all_keys = sorted(all_keys_set)
# Ensure the primary key is always first
if PRIMARY_KEY in all_keys:
all_keys.remove(PRIMARY_KEY)
all_keys.insert(0, PRIMARY_KEY)
for row in rows:
# fill empty values for missing keys
for key in all_keys:
if key not in row:
row[key] = ''
# replace list values with comma-separated strings
for key in all_keys:
if isinstance(row[key], list):
row[key] = ', '.join(row[key])
# Add a "language" key inferred from the id
row['language'] = row["id"].split('/')[0].strip()
# now write with csv.DictWriter
with open('ql_metadata.csv', 'w', newline='', encoding='utf-8') as f:
cw = csv.DictWriter(f, fieldnames=all_keys)
cw.writeheader()
cw.writerows(rows)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment