yuchenlin/clean_conceptnet.py

## clean_conceptnet.py
```
wget https://s3.amazonaws.com/conceptnet/downloads/2017/edges/conceptnet-assertions-5.5.5.csv.gz
gunzip -k conceptnet-assertions-5.5.5.csv.gz
```

import json

def del_pos(s):
    """
    Strips a trailing part-of-speech marker from an entity string.
    :param s: Entity string, e.g. "/c/en/dog/n".
    :return: The string without its final "/n", "/a", "/v" or "/r"; unchanged if it ends with none of them.
    """
    pos_suffixes = ("/n", "/a", "/v", "/r")
    # Every marker is exactly two characters long, so a single slice covers all of them.
    return s[:-2] if s.endswith(pos_suffixes) else s


def filter(input_path="conceptnet-assertions-5.5.5.csv", output_path="cpnet_clean.txt"):
    """
    Extracts English "partof"/"hasa" triples from a tab-separated assertions file.

    Each input line is split on tabs: field 1 is the relation, fields 2 and 3 are the head
    and tail entities, and field 4 is a JSON object with a "weight" key. A row is kept when
    both entities start with "/c/en/", the relation is "partof" or "hasa", and both entity
    names are alphabetic once "_" and "-" are removed. Kept rows are written to the output
    file as "head<TAB>rel<TAB>tail<TAB>weight", joined by newlines (no trailing newline).

    :param input_path: Path of the assertions file to read.
    :param output_path: Path of the cleaned triple file to write.
    :return: None.
    """
    kept_relations = ("partof", "hasa")
    results = []
    with open(input_path, encoding="utf8") as f:
        # Iterate lazily: readlines() would hold the entire dump in memory at once.
        for line in f:
            ls = line.split('\t')
            if len(ls) < 5:
                # Blank or truncated row: the entity/metadata fields are missing.
                continue

            if not (ls[2].startswith('/c/en/') and ls[3].startswith('/c/en/')):
                continue

            # The relation name is the last URI segment, lowercased for uniformity.
            # Checked first so rejected rows skip the entity processing below.
            rel = ls[1].split("/")[-1].lower()
            if rel not in kept_relations:
                continue

            # Remove the part-of-speech suffix, then keep the last URI segment as the
            # entity name, lowercased for uniformity.
            # NOTE(review): only a trailing POS tag is removed; a URI with further segments
            # after the tag keeps its final segment as the name. Confirm this is intended.
            head = del_pos(ls[2]).split("/")[-1].lower()
            tail = del_pos(ls[3]).split("/")[-1].lower()

            if not head.replace("_", "").replace("-", "").isalpha():
                continue

            if not tail.replace("_", "").replace("-", "").isalpha():
                continue

            # The JSON metadata is parsed only for rows that passed every filter.
            data = json.loads(ls[4])
            results.append("\t".join([head, rel, tail, str(data["weight"])]))

    with open(output_path, "w", encoding="utf8") as f:
        f.write("\n".join(results))


# Run the cleaning pass only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    filter()
	```
	wget https://s3.amazonaws.com/conceptnet/downloads/2017/edges/conceptnet-assertions-5.5.5.csv.gz
	gunzip -k conceptnet-assertions-5.5.5.csv.gz
	```

	import json

	def del_pos(s):
		"""
		Deletes part-of-speech encoding from an entity string, if present.
		:param s: Entity string, e.g. "/c/en/dog/n".
		:return: Entity string with a trailing "/n", "/a", "/v" or "/r" removed; unchanged otherwise.
		"""
		# Every marker is exactly two characters long, so a single slice covers all of them.
		if s.endswith(("/n", "/a", "/v", "/r")):
			s = s[:-2]
		return s


	def filter(input_path="conceptnet-assertions-5.5.5.csv", output_path="cpnet_clean.txt"):
		"""
		Extracts English "partof"/"hasa" triples from a tab-separated assertions file.

		Each input line is split on tabs: field 1 is the relation, fields 2 and 3 are the head
		and tail entities, and field 4 is a JSON object with a "weight" key. A row is kept when
		both entities start with "/c/en/", the relation is "partof" or "hasa", and both entity
		names are alphabetic once "_" and "-" are removed. Kept rows are written to the output
		file as "head<TAB>rel<TAB>tail<TAB>weight", joined by newlines (no trailing newline).

		:param input_path: Path of the assertions file to read.
		:param output_path: Path of the cleaned triple file to write.
		:return: None.
		"""
		kept_relations = ("partof", "hasa")
		results = []
		with open(input_path, encoding="utf8") as f:
			# Iterate lazily: readlines() would hold the entire dump in memory at once.
			for line in f:
				ls = line.split('\t')
				if len(ls) < 5:
					# Blank or truncated row: the entity/metadata fields are missing.
					continue

				if not (ls[2].startswith('/c/en/') and ls[3].startswith('/c/en/')):
					continue

				# The relation name is the last URI segment, lowercased for uniformity.
				# Checked first so rejected rows skip the entity processing below.
				rel = ls[1].split("/")[-1].lower()
				if rel not in kept_relations:
					continue

				# Remove the part-of-speech suffix, then keep the last URI segment as the
				# entity name, lowercased for uniformity.
				head = del_pos(ls[2]).split("/")[-1].lower()
				tail = del_pos(ls[3]).split("/")[-1].lower()

				if not head.replace("_", "").replace("-", "").isalpha():
					continue

				if not tail.replace("_", "").replace("-", "").isalpha():
					continue

				# The JSON metadata is parsed only for rows that passed every filter.
				data = json.loads(ls[4])
				results.append("\t".join([head, rel, tail, str(data["weight"])]))

		with open(output_path, "w", encoding="utf8") as f:
			f.write("\n".join(results))


	# Run the cleaning pass only when this file is executed as a script, not when imported.
	if __name__ == "__main__":
		filter()