Created
September 20, 2012 06:04
-
-
Save neubig/3754203 to your computer and use it in GitHub Desktop.
A Python program that finds tweets in a tab-separated file using a regular expression
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import datetime
import io
import re
import sys
# Regular expression searched for in each tweet body: "電力" followed, anywhere
# later on the same line, by "供給".  Written as a plain ``u''`` literal (there
# are no backslashes to protect) so it parses on both Python 2 and Python 3.
pattern = u'電力.*供給'
def output(fo, tweet_id, user_id, timestamp, body):
    """Write one tweet to *fo* as a single tab-separated line.

    Args:
        fo: Writable file-like object; only its ``write`` method is used.
        tweet_id: First column of the record.
        user_id: Second column of the record.
        timestamp: Third column of the record.
        body: Tweet text; must support ``replace``.  Literal tab characters
            are rewritten as the two characters ``\\t`` so the body cannot
            introduce extra columns.
    """
    escaped_body = body.replace('\t', '\\t')
    record = '%s\t%s\t%s\t%s\n' % (tweet_id, user_id, timestamp, escaped_body)
    fo.write(record)
def process(src, dst):
    """Copy every tweet in *src* whose body matches ``pattern`` to *dst*.

    Both files are read and written as UTF-8 text.  Each input line is split
    into four tab-separated fields: tweet id, user id, timestamp and body.
    Matching tweets are written through ``output``.

    Args:
        src: Path of the input file.
        dst: Path of the output file; it is created or truncated.

    Raises:
        ValueError: If a non-blank line has fewer than four fields.
    """
    # io.open decodes/encodes once at the file boundary and behaves the same
    # on Python 2 and Python 3; the with-blocks guarantee both files are
    # closed even if a line fails to parse.
    with io.open(src, encoding='utf-8') as fi:
        with io.open(dst, 'w', encoding='utf-8') as fo:
            for lineno, raw in enumerate(fi, 1):
                # Strip only the line terminator: a bare strip() would also
                # eat a trailing tab and lose the (empty) body column.
                line = raw.rstrip('\r\n')
                if not line.strip():
                    continue
                # maxsplit=3 keeps any tab inside the body in the body field.
                fields = line.split('\t', 3)
                if len(fields) != 4:
                    raise ValueError(
                        '%s:%d: expected 4 tab-separated fields, got %d'
                        % (src, lineno, len(fields)))
                tweet_id, user_id, timestamp, body = fields
                if re.search(pattern, body):
                    output(fo, tweet_id, user_id, timestamp, body)
# Script entry point: every command-line argument is an input file.  Matches
# for each one go to a new file named "<input>.<current date and time>.txt".
if __name__ == '__main__':
    for src in sys.argv[1:]:
        # NOTE: str(datetime.now()) contains a space and colons, which end up
        # in the output file name.
        dst = '%s.%s.txt' % (src, datetime.datetime.now())
        # Single-argument parenthesised print works on Python 2 and 3 alike.
        print("finding %s in %s and printing to %s" % (pattern, src, dst))
        process(src, dst)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment