Last active
July 10, 2025 11:01
-
-
Save mrigankpawagi/6fded73b14c3ac88430fb8415c51661a to your computer and use it in GitHub Desktop.
Script to extract information about GitHub's CodeQL queries. Clone github.com/github/codeql and run the script from the root of the repository. It outputs a JSON file called ql_metadata.json and a CSV file called ql_metadata.csv. You need to have the CodeQL CLI installed on your system.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import os | |
| import subprocess | |
| import json | |
| import csv | |
| from pathlib import Path | |
| def get_ql_and_qls_files(root_dir): | |
| ql_files = [] | |
| qls_files = [] | |
| for dirpath, _, filenames in os.walk(root_dir): | |
| for filename in filenames: | |
| rel_path = os.path.relpath(os.path.join(dirpath, filename), root_dir) | |
| # Only include files in <something>/ql/src/ | |
| # This means rel_path should match pattern */ql/src/*.ql or *.qls | |
| # We'll check if 'ql/src/' is in the rel_path and it's not at the start | |
| if os.sep + 'ql' + os.sep + 'src' + os.sep in rel_path: | |
| if filename.endswith('.ql'): | |
| ql_files.append(rel_path) | |
| elif filename.endswith('.qls'): | |
| qls_files.append(rel_path) | |
| return ql_files, qls_files | |
| if __name__ == "__main__": | |
| root = os.path.dirname(os.path.abspath(__file__)) | |
| ql_files, qls_files = get_ql_and_qls_files(root) | |
| print("Found", len(ql_files), ".ql files") | |
| print("Found", len(qls_files), ".qls files") | |
| metadata_results = {} | |
| PRIMARY_KEY = "id" | |
| KEYS_TO_SPLIT = "tags", "alternate-ids", "metricAggregate" | |
| RENAME_KEYS = { | |
| "tags:": "tags", | |
| } | |
| KEY_DEFAULT_VALUE_IF_EMPTY = { | |
| "deprecated": "Deprecated", | |
| } | |
| # First, process all .ql files for metadata | |
| for rel_path in ql_files: | |
| try: | |
| result = subprocess.run([ | |
| 'codeql', 'resolve', 'metadata', rel_path, '--format=json' | |
| ], capture_output=True, text=True, check=True) | |
| metadata = json.loads(result.stdout) | |
| for key in RENAME_KEYS: | |
| if key in metadata: | |
| new_key = RENAME_KEYS[key] | |
| metadata[new_key] = metadata.pop(key) | |
| for key in KEYS_TO_SPLIT: | |
| if key in metadata and isinstance(metadata[key], str): | |
| metadata[key] = [item.strip() for item in metadata[key].split() if item.strip()] | |
| for key, default_value in KEY_DEFAULT_VALUE_IF_EMPTY.items(): | |
| if key in metadata and not metadata[key]: | |
| metadata[key] = default_value | |
| metadata_results[rel_path] = metadata | |
| except subprocess.CalledProcessError as e: | |
| print(f"Error processing {rel_path}: {e}") | |
| # Now, for each .qls file, resolve queries and update metadata with packs | |
| for qls_rel_path in qls_files: | |
| try: | |
| result = subprocess.run([ | |
| 'codeql', 'resolve', 'queries', qls_rel_path, '--format=json' | |
| ], capture_output=True, text=True, check=True) | |
| queries = json.loads(result.stdout) | |
| qls_stem = os.path.splitext(os.path.basename(qls_rel_path))[0] | |
| for query_path in queries: | |
| # construct relative path from the absolute query_path | |
| query_rel_path = os.path.relpath(query_path, root) | |
| if query_rel_path in metadata_results: | |
| meta = metadata_results[query_rel_path] | |
| if 'packs' not in meta or not isinstance(meta['packs'], list): | |
| meta['packs'] = [] | |
| if qls_stem not in meta['packs']: | |
| meta['packs'].append(qls_stem) | |
| except subprocess.CalledProcessError as e: | |
| print(f"Error resolving queries for {qls_rel_path}: {e}") | |
| # Save to a JSON file | |
| with open('ql_metadata.json', 'w', encoding='utf-8') as f: | |
| json.dump(metadata_results, f, indent=2) | |
| # Save to a CSV file | |
| rows = [] | |
| for path, metadata in metadata_results.items(): | |
| metadata["path"] = Path(path).as_posix() | |
| rows.append(metadata) | |
| # Collect all possible keys for CSV header | |
| all_keys_set = set() | |
| for row in rows: | |
| all_keys_set.update(row.keys()) | |
| all_keys = sorted(all_keys_set) | |
| # Ensure the primary key is always first | |
| if PRIMARY_KEY in all_keys: | |
| all_keys.remove(PRIMARY_KEY) | |
| all_keys.insert(0, PRIMARY_KEY) | |
| for row in rows: | |
| # fill empty values for missing keys | |
| for key in all_keys: | |
| if key not in row: | |
| row[key] = '' | |
| # replace list values with comma-separated strings | |
| for key in all_keys: | |
| if isinstance(row[key], list): | |
| row[key] = ', '.join(row[key]) | |
| # Add a "language" key inferred from the id | |
| row['language'] = row["id"].split('/')[0].strip() | |
| # now write with csv.DictWriter | |
| with open('ql_metadata.csv', 'w', newline='', encoding='utf-8') as f: | |
| cw = csv.DictWriter(f, fieldnames=all_keys) | |
| cw.writeheader() | |
| cw.writerows(rows) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment