Skip to content

Instantly share code, notes, and snippets.

@savanovich
Created March 26, 2018 11:39
Show Gist options
  • Save savanovich/213d7299e54d75a505be04deb149bed5 to your computer and use it in GitHub Desktop.
Save savanovich/213d7299e54d75a505be04deb149bed5 to your computer and use it in GitHub Desktop.
Sparse CSR matrix (SciPy)
# Build a sparse CSR feature matrix ``X`` and a parallel label list ``y`` from
# JSON-lines files. Every line of every file matched by DATA_FILES_PATTERN is
# one JSON object and becomes one row of ``X``.
#
# COO-style triplets: data[i] is stored at position (row[i], col[i]).
row, col, data = [], [], []
row_idx = -1  # index of the most recently read row; stays -1 if nothing is read
y = []  # one class index per row, in the same order as the rows of X
feature_counter = Counter()  # feature index -> sum of its values over all rows
for file_path in data_path.glob(DATA_FILES_PATTERN):
    print(file_path)
    # Explicit UTF-8: JSON text should not be decoded with the platform's
    # locale-dependent default encoding.
    with open(Path(file_path).expanduser(), 'r', encoding='utf-8') as data_file:
        # The class of every row in a file is looked up by the file's stem.
        class_idx = BLOCK_CLASSES[file_path.stem]
        for line in data_file:
            block = json.loads(line)
            row_idx += 1
            for k, v in block.items():
                # Keys '0' and '1' hold (block_uuid, doc_uuid), not features.
                if k in ('0', '1'):
                    continue
                feature_idx = int(k)  # convert once, reuse below
                row.append(row_idx)
                col.append(feature_idx)
                data.append(v)
                feature_counter[feature_idx] += v
            # Appended once per line so y stays aligned with the rows of X.
            y.append(class_idx)
# NOTE(review): the column count is taken verbatim from
# extractor.max_feature_index; csr_matrix requires every column index to be
# strictly smaller than it. If that attribute is the largest valid index
# rather than a count, this needs "+ 1" - confirm against the extractor.
X = csr_matrix(
    (data, (row, col)),
    shape=(row_idx + 1, extractor.max_feature_index),
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment