Created
November 24, 2023 09:30
-
-
Save link89/2bcc2c4c000141042a6f72eea4aaa347 to your computer and use it in GitHub Desktop.
A script to randomly sample frames from a text data file (each frame spans a fixed number of lines)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
def sample_data_frame(path: str, frame_lines: int, nsample: int, out_path: str,
                      total_lines: int = -1, encoding='utf-8',
                      write_buffer_size=10 * 1024 * 1024,
                      ):
    """
    Sample data frames from a file randomly.

    A data file is composed of multiple frames, each frame has a fixed number of lines.
    This function picks ``nsample`` distinct frames at random and writes them,
    in their original file order, to another file.

    :param path: path to data file
    :param frame_lines: number of lines in each frame, must be positive
    :param nsample: number of frames to sample
    :param out_path: path to write sampled data
    :param total_lines: total number of lines in data file, if -1, will count by itself
    :param encoding: encoding of data file (used for both reading and writing)
    :param write_buffer_size: buffer size for writing data
    :raises ValueError: if ``frame_lines`` is not positive, if the number of lines
        is not a multiple of ``frame_lines``, or if ``nsample`` exceeds the number
        of frames
    """
    if frame_lines <= 0:
        raise ValueError(f'Number of frame lines {frame_lines} must be positive')

    if total_lines == -1:
        # One extra pass over the file just to count its lines.
        with open(path, encoding=encoding) as f:
            total_lines = sum(1 for _ in f)

    if total_lines % frame_lines != 0:
        raise ValueError(f'Number of lines {total_lines} is not multiple of frame lines {frame_lines}')

    nframes = total_lines // frame_lines
    if nsample > nframes:
        raise ValueError(f'Number of sample {nsample} is larger than number of frames {nframes}')

    frames = np.random.choice(nframes, nsample, replace=False)

    # A set gives O(1) membership per line; `in` on a NumPy array is a
    # linear scan, which made the copy loop O(total_lines * nsample).
    selected = set(frames.tolist())
    # Index of the first line after the last sampled frame: nothing beyond
    # it can be selected, so reading may stop there.
    stop_line = (max(selected) + 1) * frame_lines if selected else 0

    # read and write data
    with open(path, encoding=encoding) as f, \
            open(out_path, 'w', encoding=encoding, buffering=write_buffer_size) as fw:
        for i, line in enumerate(f):
            if i >= stop_line:
                break
            if i // frame_lines in selected:
                fw.write(line)
if __name__ == '__main__':
    # Run as a script: hand the sampler to Fire so it can be invoked from the
    # command line. Imported here so that `fire` is only required for CLI use.
    import fire
    fire.Fire(sample_data_frame)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment