Skip to content

Instantly share code, notes, and snippets.

@link89
Created November 24, 2023 09:30
Show Gist options
  • Save link89/2bcc2c4c000141042a6f72eea4aaa347 to your computer and use it in GitHub Desktop.
Save link89/2bcc2c4c000141042a6f72eea4aaa347 to your computer and use it in GitHub Desktop.
A script to randomly sample frames from a text data file (each frame spans a fixed number of lines)
import numpy as np
def sample_data_frame(path: str, frame_lines: int, nsample: int, out_path: str,
                      total_lines: int = -1, encoding='utf-8',
                      write_buffer_size=10 * 1024 * 1024,
                      ):
    """
    Sample data frames from a file randomly.

    A data file is composed of multiple frames, each frame has a fixed number of lines.
    This function picks ``nsample`` distinct frames at random (without replacement)
    and writes them to another file, keeping the order they have in the input.

    :param path: path to data file
    :param frame_lines: number of lines in each frame, must be positive
    :param nsample: number of frames to sample
    :param out_path: path to write sampled data
    :param total_lines: total number of lines in data file, if -1, will count by itself
    :param encoding: encoding of data file (used for both reading and writing)
    :param write_buffer_size: buffer size for writing data
    :raises ValueError: if ``frame_lines`` is not positive, if the number of lines
        is not a multiple of ``frame_lines``, or if ``nsample`` exceeds the number
        of frames available
    """
    # Guard first: a zero value would otherwise surface as ZeroDivisionError below,
    # and a negative one would silently produce meaningless frame indices.
    if frame_lines <= 0:
        raise ValueError(f'Number of frame lines {frame_lines} must be positive')

    if total_lines == -1:
        # One extra pass over the file to learn how many frames it holds.
        with open(path, encoding=encoding) as f:
            total_lines = sum(1 for _ in f)

    if total_lines % frame_lines != 0:
        raise ValueError(f'Number of lines {total_lines} is not multiple of frame lines {frame_lines}')

    nframes = total_lines // frame_lines
    if nsample > nframes:
        raise ValueError(f'Number of sample {nsample} is larger than number of frames {nframes}')

    frames = np.random.choice(nframes, nsample, replace=False)

    # A set gives O(1) membership per line; `x in ndarray` scans the whole array.
    selected = set(frames.tolist())
    # Index of the first line past the last sampled frame: nothing after it is needed.
    stop_line = (int(frames.max()) + 1) * frame_lines if frames.size else 0

    # read and write data
    with open(path, encoding=encoding) as f, \
            open(out_path, 'w', encoding=encoding, buffering=write_buffer_size) as fw:
        for i, line in enumerate(f):
            if i >= stop_line:
                break
            if i // frame_lines in selected:
                fw.write(line)
if __name__ == '__main__':
    # Expose sample_data_frame as a command line tool: Fire maps the function's
    # parameters to CLI arguments. Imported here so that merely importing this
    # module does not require the `fire` package.
    from fire import Fire
    Fire(sample_data_frame)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment