Skip to content

Instantly share code, notes, and snippets.

@brainstormot
Last active March 19, 2019 02:29
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save brainstormot/98700e9211412de6232c0c81bc80af39 to your computer and use it in GitHub Desktop.
Save brainstormot/98700e9211412de6232c0c81bc80af39 to your computer and use it in GitHub Desktop.
import os
import csv
# Keywords that must ALL occur in an article's content for it to be exported.
searchKeywords = [
    "P2P",
    "금융",
]
def main():
    """Run the keyword filter on every directory named ``news`` below the CWD.

    Walks the tree rooted at ``'.'``. For each sub-directory whose name is
    exactly ``"news"``, its path is printed and then passed to this module's
    ``filter`` function (which shadows the builtin of the same name).
    """
    for dirname, dirnames, _filenames in os.walk('.'):
        for subdirname in dirnames:
            if subdirname != "news":
                continue
            # Build the path once; it is both reported and processed.
            news_path = os.path.join(dirname, subdirname)
            print("in " + news_path)
            filter(news_path)
def filter(newsDirname):
    """Export keyword-matching articles found under *newsDirname* to CSV files.

    For every directory visited by ``os.walk(newsDirname)`` a CSV file named
    ``<directory basename>.csv`` is (re)created in the current working
    directory. Each file in that directory whose name ends with ``"index"``
    is read line by line: lines shorter than 30 characters (newline included)
    are skipped, the others are split on ``'|||||||'`` into header, title and
    content. For every line whose content contains all of ``searchKeywords``
    a row built by ``parseMetadata`` is written to the CSV file.

    Args:
        newsDirname: path of the directory tree to scan.

    Raises:
        ValueError: if a line of 30 or more characters does not split into
            exactly three ``'|||||||'``-separated fields.
    """
    for dirname, _dirnames, filenames in os.walk(newsDirname):
        # basename() rather than split('/') so the platform's own path
        # separator is honoured as well.
        csvPath = os.path.basename(dirname) + '.csv'
        if os.path.exists(csvPath):
            print(csvPath + ' already exist. overiding file...')
        else:
            print(csvPath + ' have just been created.')

        # Context managers guarantee both files are closed even when a
        # malformed line raises part-way through.
        with open(csvPath, 'w', encoding='utf-8', newline='') as csvFile:
            wr = csv.writer(csvFile)
            for filename in filenames:
                if not filename.endswith("index"):
                    continue
                indexPath = os.path.join(dirname, filename)
                print(indexPath)
                numOfSearched = 0
                with open(indexPath, 'r', encoding='UTF8') as f:
                    for line in f:
                        if len(line) < 30:
                            continue
                        header, title, content = line.split('|||||||')
                        if all(elem in content for elem in searchKeywords):
                            numOfSearched += 1
                            wr.writerow(parseMetadata(header, title, content))
                print(numOfSearched, "articles searched")
def parseMetadata(metadata, title, content):
    """Build one CSV row from the pieces of a parsed article line.

    Only the first whitespace-separated token of *metadata* is used; it is
    split on ``'/'`` and must yield at least five fields.

    Args:
        metadata: header text whose first token is a ``'/'``-separated path.
        title: article title; surrounding whitespace is stripped.
        content: article body; surrounding whitespace is stripped.

    Returns:
        A list ``[f1-f2-f3, f0, f4, title, content]`` where ``fN`` is the
        N-th slash-separated field of the first metadata token.

    Raises:
        IndexError: if *metadata* contains no token, or the token has fewer
            than five ``'/'``-separated fields.
    """
    fields = metadata.split()[0].split('/')
    # NOTE(review): fields 1-3 look like year/month/day and field 0 like the
    # news source -- confirm against the index file format.
    joined = "-".join(fields[1:4])
    return [joined, fields[0], fields[4], title.strip(), content.strip()]
if __name__ == "__main__":
    # Script entry point: scan the current directory tree, then report
    # completion. Both statements stay inside the guard so that importing
    # this module has no side effects.
    main()
    print("Done!")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment