JohnLonginotto/gist:ba78c88372be1fef39347e2ad2de80d1

## gistfile1.py

import sys
reads = sys.argv[1]
index = sys.argv[2]

# Get all read ids:
all_readIDs = set() 	# Using a set because it will be quicker to find things in later.
			# A list would require python to check every item in the list.
			# A set can be thought of as an always-sorted list (elements are added in their sorted order) with no duplication of elements.
			# The sorted order is based on the hash() value of the items however, which is essentially a random number.
			# A further optimization of this would to not even bother storing the values in our hash-sorted list of values, but just
			# the hashes themselves, which a friend just told me is called a "Bloom filter" and I don't get a prize for thinking it up :P

with open(reads, "r") as f:
	# The following is the simplest code to do what you want to do:
	for line in f:
		if line.startswith("@HWI"): all_readIDs.add(line)

	# However a quicker method would to just be to add every 4th line to our set once we've found the first line that starts with @HWI.
	# This is because python wouldn't have to check for @HWI on every line. Furthermore, for some FASTQ files the third row, usually "+",
	# is actually the same as the first row, the id. Trying to add the same value to a set twice is not a problem, but it takes time to check that it's
	# already in there, and nothing will change, so it's better to just read every 4th line.

if not all_readIDs: print "List is Empty" # Nice check. Like it.
else:
	# but its always a good idea to look at a few items in the set to, just to be sure:
	for idx,item in enumerate(all_readIDs):
		print item
		if idx == 10: break

with open(index, "r") as f2, open("filtered_index.fastq", "w") as f3: # using with in this way saves on some indentation. I also swapped f2 and f3 around.
	# Here's one way to do the reading every 4th line, as mentioned above:
	skip = 0
	for line in f2:
		if skip == 0:
			if line in all_readIDs: f3.write(line)
			skip = 3
		else:
			skip = skip - 1  # (can also be written as "skip -= 1")

print "All done!"

	import sys
	reads = sys.argv[1]
	index = sys.argv[2]

	# Get all read ids:
	all_readIDs = set() # Using a set because it will be quicker to find things in later.
	# A list would require python to check every item in the list.
	# A set can be thought of as an always-sorted list (elements are added in their sorted order) with no duplication of elements.
	# The sorted order is based on the hash() value of the items however, which is essentially a random number.
	# A further optimization of this would to not even bother storing the values in our hash-sorted list of values, but just
	# the hashes themselves, which a friend just told me is called a "Bloom filter" and I don't get a prize for thinking it up :P

	with open(reads, "r") as f:
	# The following is the simplest code to do what you want to do:
	for line in f:
	if line.startswith("@HWI"): all_readIDs.add(line)

	# However a quicker method would to just be to add every 4th line to our set once we've found the first line that starts with @HWI.
	# This is because python wouldn't have to check for @HWI on every line. Furthermore, for some FASTQ files the third row, usually "+",
	# is actually the same as the first row, the id. Trying to add the same value to a set twice is not a problem, but it takes time to check that it's
	# already in there, and nothing will change, so it's better to just read every 4th line.

	if not all_readIDs: print "List is Empty" # Nice check. Like it.
	else:
	# but its always a good idea to look at a few items in the set to, just to be sure:
	for idx,item in enumerate(all_readIDs):
	print item
	if idx == 10: break

	with open(index, "r") as f2, open("filtered_index.fastq", "w") as f3: # using with in this way saves on some indentation. I also swapped f2 and f3 around.
	# Here's one way to do the reading every 4th line, as mentioned above:
	skip = 0
	for line in f2:
	if skip == 0:
	if line in all_readIDs: f3.write(line)
	skip = 3
	else:
	skip = skip - 1 # (can also be written as "skip -= 1")

	print "All done!"