Skip to content

Instantly share code, notes, and snippets.

@indra-uolles
Last active June 26, 2022 09:19
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save indra-uolles/ec64b64d531acbe517fbb4df718ef894 to your computer and use it in GitHub Desktop.
Save indra-uolles/ec64b64d531acbe517fbb4df718ef894 to your computer and use it in GitHub Desktop.
Convert Scrapy csv to Retinanet csv
import re
def extract_value(fparam_name, fline):
    """Return the raw numeric text stored under *fparam_name* in *fline*.

    The line is searched for a CSV-escaped JSON fragment of the form
    ``""<name>"": <number>`` (doubled quotes, one whitespace character
    after the colon, then digits and dots).

    Args:
        fparam_name: Field name to look up, e.g. ``'x'`` or ``'width'``.
        fline: One line of the input CSV.

    Returns:
        The text following the colon, including the single leading
        whitespace character (e.g. ``' 10.5'``); ``float()`` accepts it as is.

    Raises:
        ValueError: If the field is not present in *fline*.
    """
    # Raw strings keep `\s` a regex escape rather than an invalid Python
    # string escape; re.escape stops metacharacters in the name from being
    # interpreted as regex syntax.
    pattern = r'""' + re.escape(fparam_name) + r'"":(\s[0-9.]+)'
    match = re.search(pattern, fline)
    if match is None:
        raise ValueError(
            f'couldnt extract value of {fparam_name!r} from line {fline}'
        )
    return match.group(1)
def extract_label(fline):
    """Return the annotation label found in *fline*.

    The label is taken from a CSV-escaped JSON fragment of the form
    ``""rectanglelabels"": [""<label>""]``. When that fragment is absent,
    the third comma-separated column of the line is used instead.

    Args:
        fline: One line of the input CSV.

    Returns:
        The label string, or ``None`` (after printing a message) when
        neither source yields a label.
    """
    match = re.search(r'""rectanglelabels"":\s\[""(\w+)""\]', fline)
    if match is not None:
        return match.group(1)

    # Fall back to the third column of *this* line (the argument), not of
    # whatever module-level variable happens to be named `line`.
    columns = fline.split(',')
    if len(columns) > 2:
        return columns[2]

    print(f'An exception occurred, couldnt extract label from line {fline}')
    return None
def extract_img_url(line):
    """Return the image file name from the first CSV column of *line*.

    The first column is expected to contain a path with a prefix of the
    form ``/data/upload/<digits>/<word-chars>-``; everything after that
    prefix is returned.

    Args:
        line: One line of the input CSV.

    Returns:
        The part of the first column that follows the upload prefix, or
        ``None`` (after printing a message) when the prefix is not found.
    """
    img_url_with_prefix = line.split(',')[0]
    match = re.search(r'/data/upload/(\d)+/(\w)+-', img_url_with_prefix)
    if match is None:
        print(f'An exception occurred, couldnt extract img_url from line {line}')
        return None
    # Cut at the position where the prefix actually ends. Using only the
    # prefix length would be wrong when the prefix does not start the
    # column (e.g. the column also carries a scheme and host).
    return img_url_with_prefix[match.end():]
def extract_new_line(line):
    """Build one output row ``img_url,x1,y1,x2,y2,label`` from an input row.

    Rows labelled ``'Nothing'`` produce the image name followed by five
    empty fields. For any other label the box position and size, read as
    percentages of the original image dimensions, are converted to pixel
    corner coordinates.

    Args:
        line: One line of the input CSV.

    Returns:
        The converted row as a string, without a trailing newline.
    """
    img_url = extract_img_url(line)
    label = extract_label(line)

    # Guard clause: nothing to convert for images without an object.
    if label == 'Nothing':
        return f'{img_url},,,,,'

    field_names = (
        'x', 'y', 'width', 'height', 'original_width', 'original_height',
    )
    left_pct, top_pct, box_w_pct, box_h_pct, img_w, img_h = (
        float(extract_value(name, line)) for name in field_names
    )

    # Percent of image size -> pixels; top-left corner first.
    x1 = left_pct / 100.0 * img_w
    y1 = top_pct / 100.0 * img_h
    # Bottom-right corner is the top-left corner plus the pixel extent.
    x2 = x1 + box_w_pct / 100.0 * img_w
    y2 = y1 + box_h_pct / 100.0 * img_h

    return f'{img_url},{x1},{y1},{x2},{y2},{label}'
# Read the whole input first so a missing 'project.csv' fails before the
# output file is created or truncated.
with open('project.csv') as f:
    lines = f.readlines()

# The context manager closes the output file even if converting a row raises.
with open("project_out.csv", "w") as f_out:
    # The first line of the input is skipped (treated as a header row).
    for line in lines[1:]:
        f_out.write(extract_new_line(line) + '\n')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment