Skip to content

Instantly share code, notes, and snippets.

@mr-eyes
Last active May 16, 2019 03:05
Show Gist options
  • Save mr-eyes/02b3113102d45ba9bf3d35ae4bc20834 to your computer and use it in GitHub Desktop.
Save mr-eyes/02b3113102d45ba9bf3d35ae4bc20834 to your computer and use it in GitHub Desktop.
Merge multiple <key:value> tables files.
import sys
if len(sys.argv) < 3:
sys.exit("run: python filter_table.py <file> <threshold>")
else:
file_path = sys.argv[1]
threshold = int(sys.argv[2])
with open(file_path, 'r') as f:
print(next(f).strip())
for line in f:
values = line.strip().split("\t")[2:]
values = list(map(int, values))
_max = max(values)
if _max > threshold:
print(line.strip())
import sqlite3 as sql
import glob
import sys
import os
class Merger:
DELIMITER = " "
def __init__(self, wildcart):
"""
create a database connection to a database that resides in the memory
"""
if len(wildcart) == 1:
if not os.path.isdir(wildcart[0]):
print(f"'{wildcart[0]}' directory is not found", file = sys.stderr)
sys.exit(1)
self.files_paths = sorted(glob.glob(wildcart[0] + "/*"))
else:
self.files_paths = wildcart
try:
self.conn = sql.connect(':memory:')
except sql.Error as e:
print(e)
exit()
self.conn.execute('''CREATE TABLE IF NOT EXISTS combined
(ID INTEGER PRIMARY KEY,
key TEXT DEFAULT "" NOT NULL UNIQUE);''')
for file in self.files_paths:
file = os.path.basename(file)
self.conn.execute(f"ALTER TABLE combined ADD COLUMN '{file}' INT DEFAULT 0 NOT NULL;")
self.conn.commit()
def parse_files(self):
for file in self.files_paths:
with open(file, 'r') as f:
for line in f:
line = line.strip().split(self.DELIMITER)
where = line[0]
value = line[1]
key = os.path.basename(file)
self.conn.execute(f"INSERT OR IGNORE INTO combined (key) values ('{where}');")
self.conn.execute(f"UPDATE combined SET '{key}'={value} WHERE key='{where}'")
self.conn.commit()
def export_to_tsv(self):
curs = self.conn.execute('select * from combined')
col_names = list(map(lambda x: x[0], curs.description))
rows = curs.fetchall() # collect the data
print("\t".join(col_names))
for row in rows:
row = list(map(str, row))
print("\t".join(row))
def main(wildcart):
parser = Merger(wildcart)
parser.DELIMITER = " " # Optional, delimiter is " " by default.
parser.parse_files()
parser.export_to_tsv()
if __name__ == "__main__":
if len(sys.argv) < 2 or sys.argv[1] in ["-h", "--help"]:
print(f"Examples:", file=sys.stderr)
print(f"python merge_tables.py <dir_path>|<wildcart_expression>", file=sys.stderr)
print(f"python merge_tables.py files", file=sys.stderr)
print(f"python merge_tables.py files/*txt", file=sys.stderr)
sys.exit(1)
elif len(sys.argv) == 2:
wildcart = [sys.argv[1]]
elif len(sys.argv) > 2:
wildcart = sys.argv[1:]
main(wildcart)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment