Skip to content

Instantly share code, notes, and snippets.

@joonas-yoon
Last active September 1, 2021 02:41
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save joonas-yoon/42ddd27c84d0dd938dab355adeafb67f to your computer and use it in GitHub Desktop.
Save joonas-yoon/42ddd27c84d0dd938dab355adeafb67f to your computer and use it in GitHub Desktop.
한글로만 이루어진 10GB짜리 파일 생성
import re
import random
import math
from datetime import datetime, timedelta
def print_progress_bar(time, percent, text, width):
    """Redraw a single-line progress bar on stdout.

    The line starts with a carriage return so each call overwrites the
    previous bar instead of scrolling the terminal.

    Args:
        time: Object with a ``strftime`` method; used for the [HH:MM:SS] stamp.
        percent: Completion percentage (0-100).
        text: Free-form status text printed after the bar.
        width: Total number of cells in the bar.
    """
    filled = int(width * percent / 100)
    remaining = width - filled
    bar = ''.join(('█' * filled, '-' * remaining))
    stamp = time.strftime('[%H:%M:%S]')
    # No newline: the next call rewinds with '\r' and paints over this line.
    print(f'\r{stamp} {percent:.2f}% |{bar}| {text} ', end='')
def human_byte_str(byte):
    """Format a byte count as a human-readable string using 1024-based units.

    Args:
        byte: Number of bytes (int or float).

    Returns:
        A string such as ``'512.00 B'``, ``'1.00 KB'`` or ``'10.00 GB'``.
        Values beyond the largest known unit (Y) stay expressed in that unit
        rather than running off the end of the unit table.
    """
    units = "KMGTPEZY"
    prefix = 0
    # Stop at the last unit: without the bound, values >= 1024**9 would
    # index past the end of ``units`` and raise IndexError.
    while byte >= 1024 and prefix < len(units):
        byte /= 1024
        prefix += 1
    unit = units[prefix - 1] if prefix > 0 else ''
    return '{:.2f} {}B'.format(byte, unit)
def gen(db, goal_size, out=None):
    """Write random sentences until at least ``goal_size`` bytes are produced.

    Each output line is 5-15 words drawn with replacement from ``db`` and
    joined by single spaces. While running, a progress bar is refreshed about
    once per second.

    Args:
        db: Collection of words to sample from.
        goal_size: Target output size in bytes, measured as UTF-8.
        out: Writable text stream. When omitted, the module-level ``o`` file
            handle is used.

    Raises:
        ValueError: If output is required but ``db`` contains no words.
    """
    if out is None:
        out = o  # fall back to the module-level output handle
    pool = list(db)
    if not pool and goal_size > 0:
        raise ValueError('db must contain at least one word')

    size = 0
    current = datetime.now()
    while size < goal_size:
        wc = random.randint(5, 15)
        line = ' '.join(random.choices(pool, k=wc)) + '\n'
        # Measure the real encoded length (including the newline) instead of
        # assuming every character is a 3-byte Hangul code point.
        # NOTE: platform newline translation done by the text layer is not
        # accounted for here.
        size += len(line.encode('utf-8'))
        out.write(line)

        now = datetime.now()
        if (now - current).total_seconds() >= 1:
            current = now
            percent = size / goal_size * 100
            sz_str = '{}/{}'.format(human_byte_str(size), human_byte_str(goal_size))
            print_progress_bar(now, percent, sz_str, 50)
    # Terminate the '\r'-rewritten progress line.
    print('')
# Matches every character that is NOT Hangul (compatibility jamo or a
# precomposed syllable) or whitespace. Compiled once, as a raw string so that
# ``\s`` is not treated as a string escape. No '|' separators are used: inside
# a character class '|' is a literal pipe and would let pipes through.
regex = re.compile(r'[^ㄱ-ㅎㅏ-ㅣ가-힣\s]')

# Build the vocabulary before touching the output file, so a missing or
# unreadable source.txt does not leave a truncated result2.txt behind.
db = set()
with open('source.txt', 'r', encoding='utf-8') as f:
    for s in f:
        db.update(regex.sub('', s).split())

if not db:
    raise SystemExit('source.txt contains no Hangul words')

# ``o`` stays a module-level name because gen() writes to it by default.
with open('result2.txt', 'w', encoding='utf-8') as o:
    print('Start')
    gen(db, 10 * (1024 ** 3))  # 10GB
print('Finish')
나무위키에서 가장 긴 문서 (629019글자)
https://namu.wiki/w/%EB%82%98%EB%B9%84%EC%97%90%20%EC%97%98%EB%A6%AC%20%ED%8A%B8%EB%A1%9C%EB%B9%84/%EC%9E%91%EC%A4%91%20%ED%96%89%EC%A0%81
@joonas-yoon
Copy link
Author

joonas-yoon commented Sep 1, 2021

$ python large_file_generator.py
Start
[11:40:53] 78.80% |███████████████████████████████████████-----------| 806.87 MB/1.00 GB    

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment