Sam Gross (colesbury), GitHub gists
diff --git a/Makefile.pre.in b/Makefile.pre.in
index 21b122ae0f..37a8b06987 100644
--- a/Makefile.pre.in
+++ b/Makefile.pre.in
@@ -439,6 +439,7 @@ PYTHON_OBJS= \
 		Python/modsupport.o \
 		Python/mysnprintf.o \
 		Python/mystrtoul.o \
+		Python/object_stack.o \
 		Python/optimizer.o \
import time
import json
import math
import subprocess

LOOPS = 1000

def main():
    times = []
    for i in range(1):
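The gist preview cuts the script off inside main(). Below is a minimal, self-contained sketch of how a timing harness with these imports typically continues, under the assumption that it times an external command with subprocess and reports the results as JSON; the command being timed and the output fields are placeholders, not the original gist's code.

# Hypothetical sketch, not the original gist: time a placeholder command
# LOOPS times and print summary statistics as JSON.
import time
import json
import math
import subprocess

LOOPS = 1000

def main():
    times = []
    for _ in range(LOOPS):
        start = time.perf_counter()
        subprocess.run(["true"], check=True)  # placeholder for the benchmarked command
        times.append(time.perf_counter() - start)
    print(json.dumps({
        "mean": math.fsum(times) / len(times),
        "min": min(times),
        "max": max(times),
    }))

if __name__ == "__main__":
    main()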
185.2% logging_silent
122.9% deltablue
107.0% richards
89.2% unpack_sequence
87.2% go
71.8% fannkuch
68.1% scimark_sor
61.5% unpickle_pure_python
60.8% pyflate
60.4% logging_simple
==================
WARNING: ThreadSanitizer: data race (pid=48373)
Read of size 8 at 0x7b5000000080 by main thread:
#0 memcpy <null> (python+0x449377)
#1 _Py_qsbr_register /private/home/sgross/scratch/nogil/Python/qsbr.c:192:5 (python+0x68b550)
#2 new_threadstate /private/home/sgross/scratch/nogil/Python/pystate.c:940:20 (python+0x67fc02)
#3 _PyThreadState_Prealloc /private/home/sgross/scratch/nogil/Python/pystate.c:986:12 (python+0x67fdc7)
#4 thread_PyThread_start_new_thread /private/home/sgross/scratch/nogil/./Modules/_threadmodule.c:1338:29 (python+0x71ba6e)
#5 cfunction_call /private/home/sgross/scratch/nogil/Objects/methodobject.c:471:18 (python+0x791ac9)
#6 _PyObject_MakeTpCall /private/home/sgross/scratch/nogil/Objects/call.c:189:18 (python+0x4e0d46)
import threading
from queue import Queue

import numpy as np

def thread1(queue):
    # Write into the shared array's buffer.
    queue.get().fill(5)

def thread2(queue):
    # Reallocate the same array's buffer concurrently.
    queue.get().resize((1, 1))
INFO:root:World size is : 8
INFO:root:Running without GIL
INFO:root:Set start method of multiprocessing to spawn
INFO:root:Using the config
{"seed": 0, "data": {"dataset": "iamdb", "data_path": "/datasets01/iamdb/060820/", "img_height": 64}, "criterion_type": "transducer", "criterion": {"blank": true, "allow_repeats": false, "ngram": 0}, "model_type": "tds2d", "model": {"depth": 4, "tds_groups": [{"channels": 4, "num_blocks": 3, "stride": [2, 2]}, {"channels": 16, "num_blocks": 3, "stride": [2, 2]}, {"channels": 32, "num_blocks": 3, "stride": [2, 1]}, {"channels": 64, "num_blocks": 3, "stride": [2, 1]}], "kernel_size": [5, 7], "dropout": 0.1}, "optim": {"batch_size": 32, "epochs": 400, "learning_rate": 0.1, "crit_learning_rate": 0.1, "step_size": 100, "max_grad_norm": 5}}
INFO:root:Loading dataset ...
INFO:root:Loading model ...
INFO:root:Training tds2d model with 2,644,859 parameters.
INFO:root:Starting training ...
INFO:root:Epoch 1 started.
import random
import os
import tempfile
import torch
import torch.distributed as dist
import torch.nn as nn
import torch.optim as optim
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP
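The preview for this gist stops at the imports. The sketch below (hypothetical, relying on the imports above) shows the standard single-node CPU DistributedDataParallel pattern those imports imply: spawn one process per rank, initialize a gloo process group, wrap a model in DDP, and run a single training step. The model, world size, and port are placeholders, not the gist's actual code.

# Hypothetical sketch building on the imports above; not the original gist's code.
def setup(rank, world_size):
    # Rendezvous over the default env:// init method.
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = "29500"
    dist.init_process_group("gloo", rank=rank, world_size=world_size)

def demo_basic(rank, world_size):
    setup(rank, world_size)
    model = nn.Linear(10, 5)            # placeholder model
    ddp_model = DDP(model)              # CPU DDP: no device_ids needed
    optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)
    optimizer.zero_grad()
    loss = ddp_model(torch.randn(20, 10)).sum()
    loss.backward()                     # gradients are all-reduced across ranks
    optimizer.step()
    dist.destroy_process_group()

if __name__ == "__main__":
    world_size = 2
    mp.spawn(demo_basic, args=(world_size,), nprocs=world_size, join=True)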
import gc
from time import sleep
import numpy as np
print('allocating memory')
pile = []
for i in range(1500):
    for x in range(4):
        pile.append(np.ones((256, 64), dtype=np.float32))
    pile.append(np.ones((256, 64), dtype=np.float32))
// Compile with g++ -O3 alloc.cpp -lnuma
#include <vector>
#include <memory>
#include <iostream>
#include <unistd.h>
#include <assert.h>
#include <numa.h>
#include <numaif.h>
int USE_MBIND = 0;