# mylamour/musicbrainzUrlSchemaParseCsvToJSON.py

## musicbrainzUrlSchemaParseCsvToJSON.py
import csv
import json
import os
from urlparse import urlparse

# Column names of the header-less input CSV, in column order.
fieldnames = ("@id", "sourceUrl")


def _source_id(url):
    """Return the identifier part of a parsed URL.

    The whole query string is used when the URL has one; otherwise the last
    path segment is used.  Trailing slashes are stripped before taking the
    last segment so that ``/release/abc/`` yields ``abc`` rather than an
    empty string.
    """
    if url.query:
        return url.query
    return os.path.basename(url.path.rstrip('/'))


def _source_name(netloc):
    """Guess a short site name from a network location.

    This is a heuristic, not a real public-suffix lookup: the first
    dot-separated label is used for hosts with at most two labels
    (``musicbrainz.org`` -> ``musicbrainz``) and the second label otherwise
    (``www.example.com`` -> ``example``).
    """
    labels = netloc.split('.')
    # A host without any dot ("localhost", or the empty netloc of a URL
    # lacking a scheme) has only index 0; indexing [1] would raise IndexError.
    if len(labels) <= 2:
        return labels[0]
    return labels[1]


def _build_record(row):
    """Add the derived link fields to one CSV row and return the row.

    The row dict is updated in place with ``idAtSource``, ``sourceName``,
    ``sourceDomain`` and ``@type``.
    """
    # A short CSV line leaves 'sourceUrl' as None; parse an empty URL instead
    # of handing None to urlparse.
    url = urlparse(row.get('sourceUrl') or '')
    row.update({"idAtSource": _source_id(url)})
    row.update({'sourceName': _source_name(url.netloc)})
    row.update({"sourceDomain": url.netloc})
    row.update({"@type": "ExternalLink"})
    return row


def convert_csv_to_json(csv_path='url.csv', json_path='test.json'):
    """Convert a CSV of ``@id,sourceUrl`` rows into one JSON object per line.

    csv_path:  header-less input CSV; columns are named by ``fieldnames``.
    json_path: output file, overwritten; each row becomes one JSON line.

    Both files are closed when the function returns, including on error.
    """
    with open(csv_path, 'r') as csvfile, open(json_path, 'w') as jsonfile:
        for row in csv.DictReader(csvfile, fieldnames):
            json.dump(_build_record(row), jsonfile)
            jsonfile.write('\n')


# Run the conversion only when executed as a script, so the module can be
# imported (e.g. by tests) without touching the filesystem.
if __name__ == '__main__':
    convert_csv_to_json()