Skip to content

Instantly share code, notes, and snippets.

@chengjun
Created June 23, 2015 10:20
Show Gist options
  • Save chengjun/a307d44f2d603575db34 to your computer and use it in GitHub Desktop.
Save chengjun/a307d44f2d603575db34 to your computer and use it in GitHub Desktop.
Clean WiseNews data
# Load the raw RTF export as a list of lines; each line keeps its trailing
# newline, which the parsing code below relies on.
with open("F:/百度云同步盘/Computational Communication/Data/占中数据20150328/zz-hk-2013.1-2013.3.rtf") as f:
    news = list(f)
def stringclean(s):
    r"""Strip RTF formatting residue from a piece of exported text.

    Removes the two font/colour control sequences that wrap highlighted
    and normal text in the export, every ``\par`` control word and every
    newline character.  Nothing else is altered.

    Args:
        s: Raw text taken from the RTF file (one line or several joined).

    Returns:
        The text with those sequences removed.
    """
    s = s.replace(r'\loch\af0\hich\af0\dbch\f15 \b\cf6 ', '')
    s = s.replace(r'\loch\af0\hich\af0\dbch\f15 \b0\cf0 ', '')
    # Raw string: '\p' is not a recognised escape, so the non-raw form only
    # worked by accident and triggers an invalid-escape warning on modern
    # Python.  The runtime value is unchanged (backslash + "par").
    s = s.replace(r'\par', '').replace('\n', '')
    return s
def readblocks(news, limit=1000, include_body=False):
    """Print one line of metadata for every article found in *news*.

    Lines are collected from just after a separator line (one containing
    the run of tildes followed by ``#``) up to the article-number line.
    The collected lines are then read positionally:

    * line 0: ``source|date``
    * line 1: ``section|att[|title1[|author]]``
    * line 3: second title
    * line 6 onwards: body text (blank lines skipped)

    For each article the fields id, source, date, section, att, title1,
    title2 and author are printed on one line, separated by single spaces.

    Args:
        news: List of raw lines from the RTF export.
        limit: Only the first ``limit`` lines are scanned.  Defaults to
            1000, the value previously hard-coded; pass ``None`` to scan
            every line.
        include_body: When true, the cleaned body text wrapped in double
            quotes is appended as a final field.  Off by default, so the
            output is unchanged unless requested.

    Raises:
        IndexError: If an article has fewer than four collected lines or
            its second line has fewer than two ``|``-separated fields.
        ValueError: If the first collected line does not split into
            exactly two ``|``-separated fields.
    """
    collecting = False
    block = []
    for line in news[:limit]:
        if "~~~~~~~~~~~~~~~~~~~~~~~~~~ #" in line:
            # Separator line: start collecting (the separator itself is
            # not part of the article).
            collecting = True
        elif "文章编号:" in line:
            # The article-number line closes the current article.
            article_id = stringclean(line).replace('文章编号: ', '')
            source, date = stringclean(block[0]).split('|')
            info = stringclean(block[1]).split('|')
            section = info[0]
            att = info[1]
            # title1 / author are optional trailing fields of the info line.
            title1 = info[2] if len(info) in (3, 4) else ''
            author = info[3] if len(info) == 4 else ''
            title2 = stringclean(block[3])
            fields = [article_id, source, date, section, att,
                      title1, title2, author]
            if include_body:
                body = ' '.join(j for j in block[6:] if j != '\n')
                fields.append('"' + stringclean(body) + '"')
            # A single pre-joined string prints identically under the
            # Python 2 print statement and the Python 3 print function.
            print(' '.join(fields))
            block = []
            collecting = False
        elif collecting:
            block.append(line)
# Parse the loaded lines and print the metadata of each article found.
readblocks(news)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment