# link89/sample_data_frame.py

## sample_data_frame.py
import numpy as np

def sample_data_frame(path: str, frame_lines: int, nsample: int, out_path: str,
                      total_lines: int = -1, encoding='utf-8',
                      write_buffer_size=10 * 1024 * 1024,
                      ):
    """
    Sample data frames from a file randomly.

    A data file is composed of multiple frames, each frame has a fixed number of lines.
    This function picks ``nsample`` distinct frames at random and writes them,
    in the order they appear in the input, to another file.

    :param path: path to data file
    :param frame_lines: number of lines in each frame, must be positive
    :param nsample: number of frames to sample, must not be negative
    :param out_path: path to write sampled data
    :param total_lines: total number of lines in data file, if -1, will count by itself
    :param encoding: encoding of data file
    :param write_buffer_size: buffer size for writing data
    :raises ValueError: if ``frame_lines`` is not positive, ``nsample`` is negative,
        the number of lines is not a multiple of ``frame_lines``, or ``nsample``
        is larger than the number of frames
    """
    # Validate up front so bad arguments fail with a clear ValueError instead of
    # a ZeroDivisionError or an error raised from inside numpy.
    if frame_lines <= 0:
        raise ValueError(f'Frame lines {frame_lines} must be a positive integer')
    if nsample < 0:
        raise ValueError(f'Number of sample {nsample} must not be negative')

    if total_lines == -1:
        with open(path, encoding=encoding) as f:
            total_lines = sum(1 for _ in f)

    if total_lines % frame_lines != 0:
        raise ValueError(f'Number of lines {total_lines} is not multiple of frame lines {frame_lines}')

    nframes = total_lines // frame_lines
    if nsample > nframes:
        raise ValueError(f'Number of sample {nsample} is larger than number of frames {nframes}')

    frames = np.random.choice(nframes, nsample, replace=False)

    # A set gives O(1) membership per line; testing against the numpy array
    # would rescan all sampled indices for every line of the input.
    selected = set(frames.tolist())
    # Index of the first line after the last sampled frame: nothing beyond it
    # can be selected, so reading may stop there.
    stop_line = (int(frames.max()) + 1) * frame_lines if frames.size else 0

    # read and write data
    with open(path, encoding=encoding) as f, \
         open(out_path, 'w', encoding=encoding, buffering=write_buffer_size) as fw:
        for i, line in enumerate(f):
            if i >= stop_line:
                break
            if i // frame_lines in selected:
                fw.write(line)


if __name__ == '__main__':
    # Script entry point: hand the sampler to python-fire so it can be
    # invoked from the command line.
    from fire import Fire
    Fire(sample_data_frame)
	import numpy as np

	def sample_data_frame(path: str, frame_lines: int, nsample: int, out_path: str,
			total_lines: int = -1, encoding='utf-8',
			write_buffer_size=10 * 1024 * 1024,
			):
		"""
		Sample data frames from a file randomly.

		A data file is composed of multiple frames, each frame has a fixed number of lines.
		This function picks ``nsample`` distinct frames at random and writes them,
		in the order they appear in the input, to another file.

		:param path: path to data file
		:param frame_lines: number of lines in each frame, must be positive
		:param nsample: number of frames to sample, must not be negative
		:param out_path: path to write sampled data
		:param total_lines: total number of lines in data file, if -1, will count by itself
		:param encoding: encoding of data file
		:param write_buffer_size: buffer size for writing data
		:raises ValueError: if ``frame_lines`` is not positive, ``nsample`` is negative,
			the number of lines is not a multiple of ``frame_lines``, or ``nsample``
			is larger than the number of frames
		"""
		# Validate up front so bad arguments fail with a clear ValueError instead of
		# a ZeroDivisionError or an error raised from inside numpy.
		if frame_lines <= 0:
			raise ValueError(f'Frame lines {frame_lines} must be a positive integer')
		if nsample < 0:
			raise ValueError(f'Number of sample {nsample} must not be negative')

		if total_lines == -1:
			with open(path, encoding=encoding) as f:
				total_lines = sum(1 for _ in f)

		if total_lines % frame_lines != 0:
			raise ValueError(f'Number of lines {total_lines} is not multiple of frame lines {frame_lines}')

		nframes = total_lines // frame_lines
		if nsample > nframes:
			raise ValueError(f'Number of sample {nsample} is larger than number of frames {nframes}')

		frames = np.random.choice(nframes, nsample, replace=False)

		# A set gives O(1) membership per line; testing against the numpy array
		# would rescan all sampled indices for every line of the input.
		selected = set(frames.tolist())
		# Index of the first line after the last sampled frame: nothing beyond it
		# can be selected, so reading may stop there.
		stop_line = (int(frames.max()) + 1) * frame_lines if frames.size else 0

		# read and write data
		with open(path, encoding=encoding) as f, \
				open(out_path, 'w', encoding=encoding, buffering=write_buffer_size) as fw:
			for i, line in enumerate(f):
				if i >= stop_line:
					break
				if i // frame_lines in selected:
					fw.write(line)


	if __name__ == '__main__':
		# Script entry point: hand the sampler to python-fire so it can be
		# invoked from the command line.
		import fire
		fire.Fire(sample_data_frame)