diff --git a/extract_features.py b/extract_features.py
index f186322a..febbd2fa 100644
--- a/extract_features.py
+++ b/extract_features.py
@@ -7,8 +7,8 @@ import argparse, os, json
 import h5py
 import numpy as np
-from scipy.misc import imread, imresize
-
+# from scipy.misc import imread, imresize
+from cv2 import imread, resize as imresize
 import torch
 import torchvision
@@ -86,8 +86,8 @@ def main(args):
     i0 = 0
     cur_batch = []
     for i, (path, idx) in enumerate(input_paths):
-      img = imread(path, mode='RGB')
-      img = imresize(img, img_size, interp='bicubic')
+      img = imread(path)
+      img = imresize(img, img_size)
       img = img.transpose(2, 0, 1)[None]
       cur_batch.append(img)
       if len(cur_batch) == args.batch_size:
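Note that the cv2 calls above are not exact drop-ins for the scipy.misc ones: cv2.imread returns channels in BGR order (the old imread(path, mode='RGB') returned RGB), cv2.resize takes the target size as (width, height), and its default interpolation is bilinear rather than bicubic. A minimal sketch of a loader that keeps the old behaviour; the helper name load_image is illustrative, not part of the repository:

# Sketch: reproduce the scipy.misc imread/imresize behaviour with OpenCV.
import cv2

def load_image(path, img_size):
    img = cv2.imread(path)                            # H x W x 3, BGR, uint8
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)        # match imread(path, mode='RGB')
    img = cv2.resize(img, (img_size[1], img_size[0]),  # cv2 expects (width, height)
                     interpolation=cv2.INTER_CUBIC)   # match interp='bicubic'
    return img.transpose(2, 0, 1)[None]               # NCHW, as in the original loop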
diff --git a/mac_cell.py b/mac_cell.py
index 2fc78a1f..28af5313 100644
--- a/mac_cell.py
+++ b/mac_cell.py
@@ -27,7 +27,7 @@
 3. The Write Unit integrates the retrieved information to the previous hidden
 memory state, given the value of the control state, to perform the current reasoning operation.
 '''
-class MACCell(tf.nn.rnn_cell.RNNCell):
+class MACCell(tf.compat.v1.nn.rnn_cell.RNNCell):
 
     '''Initialize the MAC cell.
     (Note that in the current version the cell is stateful --
@@ -133,7 +133,7 @@ def output_size(self):
     def control(self, controlInput, inWords, outWords, questionLengths,
         control, contControl = None, name = "", reuse = None):
 
-        with tf.variable_scope("control" + name, reuse = reuse):
+        with tf.compat.v1.variable_scope("control" + name, reuse = reuse):
             dim = config.ctrlDim
 
             ## Step 1: compute "continuous" control state given previous control and question.
@@ -207,14 +207,14 @@ def control(self, controlInput, inWords, outWords, questionLengths,
     [batchSize, memDim]
     '''
     def read(self, knowledgeBase, memory, control, name = "", reuse = None):
-        with tf.variable_scope("read" + name, reuse = reuse):
+        with tf.compat.v1.variable_scope("read" + name, reuse = reuse):
             dim = config.memDim
 
             ## memory dropout
             if config.memoryVariationalDropout:
                 memory = ops.applyVarDpMask(memory, self.memDpMask, self.dropouts["memory"])
             else:
-                memory = tf.nn.dropout(memory, self.dropouts["memory"])
+                memory = tf.compat.v1.nn.dropout(memory, self.dropouts["memory"])
 
             ## Step 1: knowledge base / memory interactions
             # parameters for knowledge base and memory projection
@@ -303,7 +303,7 @@ def read(self, knowledgeBase, memory, control, name = "", reuse = None):
     [batchSize, memDim]
     '''
     def write(self, memory, info, control, contControl = None, name = "", reuse = None):
-        with tf.variable_scope("write" + name, reuse = reuse):
+        with tf.compat.v1.variable_scope("write" + name, reuse = reuse):
 
             # optionally project info
             if config.writeInfoProj:
@@ -374,8 +374,8 @@ def write(self, memory, info, control, contControl = None, name = "", reuse = No
 
         return newMemory
 
-    def memAutoEnc(newMemory, info, control, name = "", reuse = None):
-        with tf.variable_scope("memAutoEnc" + name, reuse = reuse):
+    def memAutoEnc(self, newMemory, info, control, name = "", reuse = None):
+        with tf.compat.v1.variable_scope("memAutoEnc" + name, reuse = reuse):
             # inputs to auto encoder
             features = info if config.autoEncMemInputs == "INFO" else newMemory
             features = ops.linear(features, config.memDim, config.ctrlDim,
@@ -419,7 +419,7 @@ def memAutoEnc(newMemory, info, control, name = "", reuse = None):
     '''
     def __call__(self, inputs, state, scope = None):
         scope = scope or type(self).__name__
-        with tf.variable_scope(scope, reuse = self.reuse): # as tfscope
+        with tf.compat.v1.variable_scope(scope, reuse = self.reuse): # as tfscope
             control = state.control
             memory = state.memory
@@ -460,7 +460,7 @@ def __call__(self, inputs, state, scope = None):
 
             if config.writeDropout < 1.0:
                 # write unit
-                info = tf.nn.dropout(info, self.dropouts["write"])
+                info = tf.compat.v1.nn.dropout(info, self.dropouts["write"])
 
             newMemory = self.write(memory, info, newControl, self.contControl,
                 name = cellName, reuse = cellReuse)
@@ -495,9 +495,9 @@ def __call__(self, inputs, state, scope = None):
     '''
     def initState(self, name, dim, initType, batchSize):
         if initType == "PRM":
-            prm = tf.get_variable(name, shape = (dim, ),
+            prm = tf.compat.v1.get_variable(name, shape = (dim, ),
                 initializer = tf.random_normal_initializer())
-            initState = tf.tile(tf.expand_dims(prm, axis = 0), [batchSize, 1])
+            initState = tf.compat.v1.tile(tf.expand_dims(prm, axis = 0), [batchSize, 1])
         elif initType == "ZERO":
             initState = tf.zeros((batchSize, dim), dtype = tf.float32)
         else: # "Q"
@@ -516,8 +516,8 @@ def initState(self, name, dim, initType, batchSize):
 
     Returns the updated word sequence and lengths.
     '''
-    def addNullWord(words, lengths):
-        nullWord = tf.get_variable("zeroWord", shape = (1 , config.ctrlDim), initializer = tf.random_normal_initializer())
+    def addNullWord(self, words, lengths):
+        nullWord = tf.compat.v1.get_variable("zeroWord", shape = (1 , config.ctrlDim), initializer = tf.random_normal_initializer())
         nullWord = tf.tile(tf.expand_dims(nullWord, axis = 0), [self.batchSize, 1, 1])
         words = tf.concat([nullWord, words], axis = 1)
         lengths += 1
@@ -582,7 +582,7 @@ def zero_state(self, batchSize, dtype = tf.float32):
 
         # if config.controlCoverage:
         #     self.coverage = tf.zeros((batchSize, tf.shape(words)[1]), dtype = tf.float32)
-        #     self.coverageBias = tf.get_variable("coverageBias", shape = (),
+        #     self.coverageBias = tf.compat.v1.get_variable("coverageBias", shape = (),
         #         initializer = config.controlCoverageBias)
 
         ## initialize memory variational dropout mask
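The MAC cell depends on TF1-style variable sharing: every reasoning step calls the same variable_scope, so all iterations reuse one set of weights. A minimal, self-contained sketch of that pattern under the compat.v1 API (the scope and variable names here are illustrative only):

# Sketch: graph-mode variable sharing, the pattern MACCell relies on across iterations.
import tensorflow as tf
tf.compat.v1.disable_eager_execution()

def step(x, reuse):
    with tf.compat.v1.variable_scope("MACCell", reuse=reuse):
        w = tf.compat.v1.get_variable("w", shape=(4, 4))  # created on the first call
        return tf.matmul(x, w)

x = tf.compat.v1.placeholder(tf.float32, shape=(None, 4))
out = step(x, reuse=None)    # creates "MACCell/w"
out = step(out, reuse=True)  # later calls reuse the same variable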
diff --git a/main.py b/main.py
index 198992f2..4ce392b7 100644
--- a/main.py
+++ b/main.py
@@ -23,6 +23,8 @@ from model import MACnet
 
 from collections import defaultdict
 
+tf.compat.v1.disable_eager_execution()
+
 ############################################# loggers #############################################
 
 # Writes log header to file
@@ -151,7 +153,7 @@ def writePreds(preprocessor, evalRes, extraEvalRes):
 ############################################# session #############################################
 # Initializes TF session. Sets GPU memory configuration.
 def setSession():
-    sessionConfig = tf.ConfigProto(allow_soft_placement = True, log_device_placement = False)
+    sessionConfig = tf.compat.v1.ConfigProto(allow_soft_placement = True, log_device_placement = False)
     if config.allowGrowth:
         sessionConfig.gpu_options.allow_growth = True
     if config.maxMemory < 1.0:
@@ -161,17 +163,17 @@ def setSession():
 ############################################## savers #############################################
 # Initializes savers (standard, optional exponential-moving-average and optional for subset of variables)
 def setSavers(model):
-    saver = tf.train.Saver(max_to_keep = config.weightsToKeep)
+    saver = tf.compat.v1.train.Saver(max_to_keep = config.weightsToKeep)
 
     subsetSaver = None
     if config.saveSubset:
         isRelevant = lambda var: any(s in var.name for s in config.varSubset)
-        relevantVars = [var for var in tf.global_variables() if isRelevant(var)]
-        subsetSaver = tf.train.Saver(relevantVars, max_to_keep = config.weightsToKeep, allow_empty = True)
+        relevantVars = [var for var in tf.compat.v1.global_variables() if isRelevant(var)]
+        subsetSaver = tf.compat.v1.train.Saver(relevantVars, max_to_keep = config.weightsToKeep, allow_empty = True)
 
     emaSaver = None
     if config.useEMA:
-        emaSaver = tf.train.Saver(model.emaDict, max_to_keep = config.weightsToKeep)
+        emaSaver = tf.compat.v1.train.Saver(model.emaDict, max_to_keep = config.weightsToKeep)
 
     return {
         "saver": saver,
@@ -657,7 +659,7 @@ def main():
         config.gpusNum = len(config.gpus.split(","))
         os.environ["CUDA_VISIBLE_DEVICES"] = config.gpus
 
-    tf.logging.set_verbosity(tf.logging.ERROR)
+    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
 
     # process data
     print(bold("Preprocess data..."))
@@ -673,7 +675,7 @@ def main():
     print("took {} seconds".format(bcolored("{:.2f}".format(time.time() - start), "blue")))
 
     # initializer
-    init = tf.global_variables_initializer()
+    init = tf.compat.v1.global_variables_initializer()
 
     # savers
     savers = setSavers(model)
@@ -682,7 +684,7 @@ def main():
     # sessionConfig
     sessionConfig = setSession()
 
-    with tf.Session(config = sessionConfig) as sess:
+    with tf.compat.v1.Session(config = sessionConfig) as sess:
 
         # ensure no more ops are added after model is built
         sess.graph.finalize()
@@ -711,7 +713,7 @@ def main():
                 # save weights
                 saver.save(sess, config.weightsFile(epoch))
                 if config.saveSubset:
                     subsetSaver.save(sess, config.subsetWeightsFile(epoch))
 
                 # load EMA weights
                 if config.useEMA:
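Because TF2 executes eagerly by default, tf.compat.v1.disable_eager_execution() has to run before any op or placeholder is created, which is why the patch places it next to the imports. A minimal sketch of the session bootstrap this migration assumes (mirroring setSession and main above, with the GPU options hard-coded for brevity):

# Sketch: TF1-style graph/session bootstrap on top of TF2.
import tensorflow as tf
tf.compat.v1.disable_eager_execution()   # must precede any graph construction

sessionConfig = tf.compat.v1.ConfigProto(allow_soft_placement=True,
                                         log_device_placement=False)
sessionConfig.gpu_options.allow_growth = True

with tf.compat.v1.Session(config=sessionConfig) as sess:
    sess.run(tf.compat.v1.global_variables_initializer())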
tf.get_variable("biases", shape = (dim,), + with tf.compat.v1.variable_scope("additiveBiases" + name): + b = tf.compat.v1.get_variable("biases", shape = (dim,), initializer = tf.zeros_initializer()) + bInitial - with tf.variable_scope("multiplicativeBias" + name): - beta = tf.get_variable("biases", shape = (3 * dim,), + with tf.compat.v1.variable_scope("multiplicativeBias" + name): + beta = tf.compat.v1.get_variable("biases", shape = (3 * dim,), initializer = tf.ones_initializer()) Wx, Uh, inter = tf.split(beta * tf.concat([inp1, inp2, inp1 * inp2], axis = 1), @@ -38,7 +38,7 @@ def addBiases(self, inp1, inp2, dim, bInitial = 0, name = ""): def __call__(self, inputs, state, scope = None): scope = scope or type(self).__name__ - with tf.variable_scope(scope, reuse = self.reuse): + with tf.compat.v1.variable_scope(scope, reuse = self.reuse): inputSize = int(inputs.shape[1]) Wxr = self.mulWeights(inputs, inputSize, self.numUnits, name = "Wxr") diff --git a/mi_lstm_cell.py b/mi_lstm_cell.py index 6cfa2f66..3764be78 100644 --- a/mi_lstm_cell.py +++ b/mi_lstm_cell.py @@ -1,7 +1,7 @@ import tensorflow as tf import numpy as np -class MiLSTMCell(tf.nn.rnn_cell.RNNCell): +class MiLSTMCell(tf.compat.v1.nn.rnn_cell.RNNCell): def __init__(self, num_units, forget_bias = 1.0, input_size = None, state_is_tuple = True, activation = tf.tanh, reuse = None): self.numUnits = num_units @@ -11,25 +11,25 @@ def __init__(self, num_units, forget_bias = 1.0, input_size = None, @property def state_size(self): - return tf.nn.rnn_cell.LSTMStateTuple(self.numUnits, self.numUnits) + return tf.compat.v1.nn.rnn_cell.LSTMStateTuple(self.numUnits, self.numUnits) @property def output_size(self): return self.numUnits def mulWeights(self, inp, inDim, outDim, name = ""): - with tf.variable_scope("weights" + name): - W = tf.get_variable("weights", shape = (inDim, outDim), - initializer = tf.contrib.layers.xavier_initializer()) + with tf.compat.v1.variable_scope("weights" + name): + W = tf.compat.v1.get_variable("weights", shape = (inDim, outDim), + initializer = tf.compat.v1.keras.initializers.glorot_normal()) output = tf.matmul(inp, W) return output def addBiases(self, inp1, inp2, dim, name = ""): - with tf.variable_scope("additiveBiases" + name): - b = tf.get_variable("biases", shape = (dim,), + with tf.compat.v1.variable_scope("additiveBiases" + name): + b = tf.compat.v1.get_variable("biases", shape = (dim,), initializer = tf.zeros_initializer()) - with tf.variable_scope("multiplicativeBias" + name): - beta = tf.get_variable("biases", shape = (3 * dim,), + with tf.compat.v1.variable_scope("multiplicativeBias" + name): + beta = tf.compat.v1.get_variable("biases", shape = (3 * dim,), initializer = tf.ones_initializer()) Wx, Uh, inter = tf.split(beta * tf.concat([inp1, inp2, inp1 * inp2], axis = 1), @@ -39,7 +39,7 @@ def addBiases(self, inp1, inp2, dim, name = ""): def __call__(self, inputs, state, scope = None): scope = scope or type(self).__name__ - with tf.variable_scope(scope, reuse = self.reuse): + with tf.compat.v1.variable_scope(scope, reuse = self.reuse): c, h = state inputSize = int(inputs.shape[1]) @@ -68,10 +68,10 @@ def __call__(self, inputs, state, scope = None): self.activation(j)) newH = self.activation(newC) * tf.nn.sigmoid(o) - newState = tf.nn.rnn_cell.LSTMStateTuple(newC, newH) + newState = tf.compat.v1.nn.rnn_cell.LSTMStateTuple(newC, newH) return newH, newState def zero_state(self, batchSize, dtype = tf.float32): - return tf.nn.rnn_cell.LSTMStateTuple(tf.zeros((batchSize, self.numUnits), dtype = dtype), + 
diff --git a/mi_lstm_cell.py b/mi_lstm_cell.py
index 6cfa2f66..3764be78 100644
--- a/mi_lstm_cell.py
+++ b/mi_lstm_cell.py
@@ -1,7 +1,7 @@
 import tensorflow as tf
 import numpy as np
 
-class MiLSTMCell(tf.nn.rnn_cell.RNNCell):
+class MiLSTMCell(tf.compat.v1.nn.rnn_cell.RNNCell):
     def __init__(self, num_units, forget_bias = 1.0, input_size = None,
         state_is_tuple = True, activation = tf.tanh, reuse = None):
         self.numUnits = num_units
@@ -11,25 +11,25 @@ def __init__(self, num_units, forget_bias = 1.0, input_size = None,
 
     @property
     def state_size(self):
-        return tf.nn.rnn_cell.LSTMStateTuple(self.numUnits, self.numUnits)
+        return tf.compat.v1.nn.rnn_cell.LSTMStateTuple(self.numUnits, self.numUnits)
 
     @property
     def output_size(self):
         return self.numUnits
 
     def mulWeights(self, inp, inDim, outDim, name = ""):
-        with tf.variable_scope("weights" + name):
-            W = tf.get_variable("weights", shape = (inDim, outDim),
-                initializer = tf.contrib.layers.xavier_initializer())
+        with tf.compat.v1.variable_scope("weights" + name):
+            W = tf.compat.v1.get_variable("weights", shape = (inDim, outDim),
+                initializer = tf.compat.v1.keras.initializers.glorot_normal())
 
         output = tf.matmul(inp, W)
         return output
 
     def addBiases(self, inp1, inp2, dim, name = ""):
-        with tf.variable_scope("additiveBiases" + name):
-            b = tf.get_variable("biases", shape = (dim,),
+        with tf.compat.v1.variable_scope("additiveBiases" + name):
+            b = tf.compat.v1.get_variable("biases", shape = (dim,),
                 initializer = tf.zeros_initializer())
-        with tf.variable_scope("multiplicativeBias" + name):
-            beta = tf.get_variable("biases", shape = (3 * dim,),
+        with tf.compat.v1.variable_scope("multiplicativeBias" + name):
+            beta = tf.compat.v1.get_variable("biases", shape = (3 * dim,),
                 initializer = tf.ones_initializer())
 
         Wx, Uh, inter = tf.split(beta * tf.concat([inp1, inp2, inp1 * inp2], axis = 1),
@@ -39,7 +39,7 @@ def addBiases(self, inp1, inp2, dim, name = ""):
 
     def __call__(self, inputs, state, scope = None):
         scope = scope or type(self).__name__
-        with tf.variable_scope(scope, reuse = self.reuse):
+        with tf.compat.v1.variable_scope(scope, reuse = self.reuse):
             c, h = state
             inputSize = int(inputs.shape[1])
 
@@ -68,10 +68,10 @@ def __call__(self, inputs, state, scope = None):
                 self.activation(j))
             newH = self.activation(newC) * tf.nn.sigmoid(o)
 
-            newState = tf.nn.rnn_cell.LSTMStateTuple(newC, newH)
+            newState = tf.compat.v1.nn.rnn_cell.LSTMStateTuple(newC, newH)
 
         return newH, newState
 
     def zero_state(self, batchSize, dtype = tf.float32):
-        return tf.nn.rnn_cell.LSTMStateTuple(tf.zeros((batchSize, self.numUnits), dtype = dtype),
+        return tf.compat.v1.nn.rnn_cell.LSTMStateTuple(tf.zeros((batchSize, self.numUnits), dtype = dtype),
             tf.zeros((batchSize, self.numUnits), dtype = dtype))
\ No newline at end of file
diff --git a/model.py b/model.py
index f3114089..4a81229a 100644
--- a/model.py
+++ b/model.py
@@ -56,40 +56,40 @@ def __init__(self, embeddingsInit, answerDict):
     '''
     # change to H x W x C?
     def addPlaceholders(self):
-        with tf.variable_scope("Placeholders"):
+        with tf.compat.v1.variable_scope("Placeholders"):
             ## data
             # questions
-            self.questionsIndicesAll = tf.placeholder(tf.int32, shape = (None, None))
-            self.questionLengthsAll = tf.placeholder(tf.int32, shape = (None, ))
+            self.questionsIndicesAll = tf.compat.v1.placeholder(tf.int32, shape = (None, None))
+            self.questionLengthsAll = tf.compat.v1.placeholder(tf.int32, shape = (None, ))
 
             # images
             # put image known dimension as last dim?
-            self.imagesPlaceholder = tf.placeholder(tf.float32, shape = (None, None, None, None))
+            self.imagesPlaceholder = tf.compat.v1.placeholder(tf.float32, shape = (None, None, None, None))
             self.imagesAll = tf.transpose(self.imagesPlaceholder, (0, 2, 3, 1))
             # self.imageH = tf.shape(self.imagesAll)[1]
             # self.imageW = tf.shape(self.imagesAll)[2]
 
             # answers
-            self.answersIndicesAll = tf.placeholder(tf.int32, shape = (None, ))
+            self.answersIndicesAll = tf.compat.v1.placeholder(tf.int32, shape = (None, ))
 
             ## optimization
-            self.lr = tf.placeholder(tf.float32, shape = ())
-            self.train = tf.placeholder(tf.bool, shape = ())
+            self.lr = tf.compat.v1.placeholder(tf.float32, shape = ())
+            self.train = tf.compat.v1.placeholder(tf.bool, shape = ())
             self.batchSizeAll = tf.shape(self.questionsIndicesAll)[0]
 
             ## dropouts
             # TODO: change dropouts to be 1 - current
             self.dropouts = {
-                "encInput": tf.placeholder(tf.float32, shape = ()),
-                "encState": tf.placeholder(tf.float32, shape = ()),
-                "stem": tf.placeholder(tf.float32, shape = ()),
-                "question": tf.placeholder(tf.float32, shape = ()),
-                # self.dropouts["question"]Out = tf.placeholder(tf.float32, shape = ())
-                # self.dropouts["question"]MAC = tf.placeholder(tf.float32, shape = ())
-                "read": tf.placeholder(tf.float32, shape = ()),
-                "write": tf.placeholder(tf.float32, shape = ()),
-                "memory": tf.placeholder(tf.float32, shape = ()),
-                "output": tf.placeholder(tf.float32, shape = ())
+                "encInput": tf.compat.v1.placeholder(tf.float32, shape = ()),
+                "encState": tf.compat.v1.placeholder(tf.float32, shape = ()),
+                "stem": tf.compat.v1.placeholder(tf.float32, shape = ()),
+                "question": tf.compat.v1.placeholder(tf.float32, shape = ()),
+                # self.dropouts["question"]Out = tf.compat.v1.placeholder(tf.float32, shape = ())
+                # self.dropouts["question"]MAC = tf.compat.v1.placeholder(tf.float32, shape = ())
+                "read": tf.compat.v1.placeholder(tf.float32, shape = ()),
+                "write": tf.compat.v1.placeholder(tf.float32, shape = ()),
+                "memory": tf.compat.v1.placeholder(tf.float32, shape = ()),
+                "output": tf.compat.v1.placeholder(tf.float32, shape = ())
             }
 
             # batch norm params
@@ -103,7 +103,7 @@ def addPlaceholders(self):
             # self.dropouts["read"] = self.dropouts["_read"]
 
             # if config.tempDynamic:
-            #     self.tempAnnealRate = tf.placeholder(tf.float32, shape = ())
+            #     self.tempAnnealRate = tf.compat.v1.placeholder(tf.float32, shape = ())
 
             self.H, self.W, self.imageInDim = config.imageDims
@@ -137,7 +137,7 @@ def createFeedDict(self, data, images, train):
 
     # Splits data to a specific GPU (tower) for parallelization
     def initTowerBatch(self, towerI, towersNum, dataSize):
-        towerBatchSize = tf.floordiv(dataSize, towersNum)
+        towerBatchSize = tf.compat.v1.floordiv(dataSize, towersNum)
         start = towerI * towerBatchSize
         end = (towerI + 1) * towerBatchSize if towerI < towersNum - 1 else dataSize
@@ -164,7 +164,7 @@ def initTowerBatch(self, towerI, towersNum, dataSize):
     '''
     def stem(self, images, inDim, outDim, addLoc = None):
-        with tf.variable_scope("stem"):
+        with tf.compat.v1.variable_scope("stem"):
             if addLoc is None:
                 addLoc = config.locationAware
@@ -206,13 +206,13 @@ def stem(self, images, inDim, outDim, addLoc = None):
     # Embed question using parametrized word embeddings.
     # The embedding are initialized to the values supported to the class initialization
     def qEmbeddingsOp(self, qIndices, embInit):
-        with tf.variable_scope("qEmbeddings"):
+        with tf.compat.v1.variable_scope("qEmbeddings"):
             # if config.useCPU:
             #     with tf.device('/cpu:0'):
             #         embeddingsVar = tf.Variable(self.embeddingsInit, name = "embeddings", dtype = tf.float32)
             # else:
             #     embeddingsVar = tf.Variable(self.embeddingsInit, name = "embeddings", dtype = tf.float32)
-            embeddingsVar = tf.get_variable("emb", initializer = tf.to_float(embInit),
+            embeddingsVar = tf.compat.v1.get_variable("emb", initializer = tf.compat.v1.to_float(embInit),
                 dtype = tf.float32, trainable = (not config.wrdEmbFixed))
             embeddings = tf.concat([tf.zeros((1, config.wrdEmbDim)), embeddingsVar], axis = 0)
             questions = tf.nn.embedding_lookup(embeddings, qIndices)
@@ -221,10 +221,10 @@ def qEmbeddingsOp(self, qIndices, embInit):
 
     # Embed answer words
     def aEmbeddingsOp(self, embInit):
-        with tf.variable_scope("aEmbeddings"):
+        with tf.compat.v1.variable_scope("aEmbeddings"):
             if embInit is None:
                 return None
-            answerEmbeddings = tf.get_variable("emb", initializer = tf.to_float(embInit),
+            answerEmbeddings = tf.compat.v1.get_variable("emb", initializer = tf.compat.v1.to_float(embInit),
                 dtype = tf.float32)
             return answerEmbeddings
@@ -279,7 +279,7 @@ def embeddingsOp(self, qIndices, embInit):
     def encoder(self, questions, questionLengths, projWords = False,
         projQuestion = False, projDim = None):
-        with tf.variable_scope("encoder"):
+        with tf.compat.v1.variable_scope("encoder"):
             # variational dropout option
             varDp = None
             if config.encVariationalDropout:
@@ -294,7 +294,7 @@ def encoder(self, questions, questionLengths, projWords = False,
                     dropout = self.dropouts["encInput"], varDp = varDp, name = "rnn%d" % i)
 
             # dropout for the question vector
-            vecQuestions = tf.nn.dropout(vecQuestions, self.dropouts["question"])
+            vecQuestions = tf.compat.v1.nn.dropout(vecQuestions, self.dropouts["question"])
 
             # projection of encoder outputs
             if projWords:
@@ -325,7 +325,7 @@ def encoder(self, questions, questionLengths, projWords = False,
     Returns the new memory value.
     '''
     def baselineAttLayer(self, images, memory, inDim, hDim, name = "", reuse = None):
-        with tf.variable_scope("attLayer" + name, reuse = reuse):
+        with tf.compat.v1.variable_scope("attLayer" + name, reuse = reuse):
             # projImages = ops.linear(images, inDim, hDim, name = "projImage")
             # projMemory = tf.expand_dims(ops.linear(memory, inDim, hDim, name = "projMemory"), axis = -2)
             # if config.saMultiplicative:
@@ -368,7 +368,7 @@ def baselineAttLayer(self, images, memory, inDim, hDim, name = "", reuse = None)
     [batchSize, outDim] (out dimension depends on baseline method)
     '''
     def baseline(self, vecQuestions, questionDim, images, imageDim, hDim):
-        with tf.variable_scope("baseline"):
+        with tf.compat.v1.variable_scope("baseline"):
             if config.baselineAtt:
                 memory = self.linear(vecQuestions, questionDim, hDim, name = "qProj")
                 images = self.linear(images, imageDim, hDim, name = "iProj")
@@ -428,7 +428,7 @@ def baseline(self, vecQuestions, questionDim, images, imageDim, hDim):
     def MACnetwork(self, images, vecQuestions, questionWords, questionCntxWords,
         questionLengths, name = "", reuse = None):
 
-        with tf.variable_scope("MACnetwork" + name, reuse = reuse):
+        with tf.compat.v1.variable_scope("MACnetwork" + name, reuse = reuse):
 
             self.macCell = MACCell(
                 vecQuestions = vecQuestions,
@@ -453,7 +453,7 @@ def MACnetwork(self, images, vecQuestions, questionWords, questionCntxWords,
             for i in range(config.netLength):
                 self.macCell.iteration = i
                 # if config.unsharedCells:
-                #     with tf.variable_scope("iteration%d" % i):
+                #     with tf.compat.v1.variable_scope("iteration%d" % i):
                 #         macCell.myNameScope = "iteration%d" % i
                 _, state = self.macCell(none, state)
                 # else:
@@ -510,7 +510,7 @@ def MACnetwork(self, images, vecQuestions, questionWords, questionCntxWords,
     Returns the resulted features and their dimension.
     '''
     def outputOp(self, memory, vecQuestions, images, imageInDim):
-        with tf.variable_scope("outputUnit"):
+        with tf.compat.v1.variable_scope("outputUnit"):
             features = memory
             dim = config.memDim
@@ -545,7 +545,7 @@ def outputOp(self, memory, vecQuestions, images, imageInDim):
     [batchSize, answerWordsNum]
     '''
     def classifier(self, features, inDim, aEmbeddings = None):
-        with tf.variable_scope("classifier"):
+        with tf.compat.v1.variable_scope("classifier"):
             outDim = config.answerWordsNum
             dims = [inDim] + config.outClassifierDims + [outDim]
             if config.answerMod != "NON":
@@ -557,10 +557,10 @@ def classifier(self, features, inDim, aEmbeddings = None):
                 dropout = self.dropouts["output"])
 
             if config.answerMod != "NON":
-                logits = tf.nn.dropout(logits, self.dropouts["output"])
+                logits = tf.compat.v1.nn.dropout(logits, self.dropouts["output"])
                 interactions = ops.mul(aEmbeddings, logits, dims[-1], interMod = config.answerMod)
                 logits = ops.inter2logits(interactions, dims[-1], sumMod = "SUM")
-                logits += ops.getBias((outputDim, ), "ans")
+                logits += ops.getBias((outDim, ), "ans")
 
                 # answersWeights = tf.transpose(aEmbeddings)
@@ -576,9 +576,9 @@ def classifier(self, features, inDim, aEmbeddings = None):
         return logits
 
     # def getTemp():
-    #     with tf.variable_scope("temperature"):
+    #     with tf.compat.v1.variable_scope("temperature"):
    #         if config.tempParametric:
-    #             self.temperatureVar = tf.get_variable("temperature", shape = (),
+    #             self.temperatureVar = tf.compat.v1.get_variable("temperature", shape = (),
    #                 initializer = tf.constant_initializer(5), dtype = tf.float32)
    #             temperature = tf.sigmoid(self.temperatureVar)
    #         else:
@@ -591,7 +591,7 @@ def classifier(self, features, inDim, aEmbeddings = None):
     # Computes mean cross entropy loss between logits and answers.
     def addAnswerLossOp(self, logits, answers):
-        with tf.variable_scope("answerLoss"):
+        with tf.compat.v1.variable_scope("answerLoss"):
             losses = tf.nn.sparse_softmax_cross_entropy_with_logits(labels = answers, logits = logits)
             loss = tf.reduce_mean(losses)
             self.answerLossList.append(loss)
@@ -601,11 +601,11 @@ def addAnswerLossOp(self, logits, answers):
     # Computes predictions (by finding maximal logit value, corresponding to highest probability)
     # and mean accuracy between predictions and answers.
     def addPredOp(self, logits, answers):
-        with tf.variable_scope("pred"):
-            preds = tf.to_int32(tf.argmax(logits, axis = -1)) # tf.nn.softmax(
+        with tf.compat.v1.variable_scope("pred"):
+            preds = tf.compat.v1.to_int32(tf.argmax(logits, axis = -1)) # tf.nn.softmax(
             corrects = tf.equal(preds, answers)
-            correctNum = tf.reduce_sum(tf.to_int32(corrects))
-            acc = tf.reduce_mean(tf.to_float(corrects))
+            correctNum = tf.reduce_sum(tf.compat.v1.to_int32(corrects))
+            acc = tf.reduce_mean(tf.compat.v1.to_float(corrects))
 
             self.correctNumList.append(correctNum)
             self.answerAccList.append(acc)
@@ -613,9 +613,9 @@ def addPredOp(self, logits, answers):
 
     # Creates optimizer (adam)
     def addOptimizerOp(self):
-        with tf.variable_scope("trainAddOptimizer"):
+        with tf.compat.v1.variable_scope("trainAddOptimizer"):
             self.globalStep = tf.Variable(0, dtype = tf.int32, trainable = False, name = "globalStep") # init to 0 every run?
-            optimizer = tf.train.AdamOptimizer(learning_rate = self.lr)
+            optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate = self.lr)
 
         return optimizer
@@ -624,10 +624,10 @@ def addOptimizerOp(self):
     using optimizer.
     '''
     def computeGradients(self, optimizer, loss, trainableVars = None): # tf.trainable_variables()
-        with tf.variable_scope("computeGradients"):
+        with tf.compat.v1.variable_scope("computeGradients"):
             if config.trainSubset:
                 trainableVars = []
-                allVars = tf.trainable_variables()
+                allVars = tf.compat.v1.trainable_variables()
                 for var in allVars:
                     if any((s in var.name) for s in config.varSubset):
                         trainableVars.append(var)
@@ -640,9 +640,9 @@ def computeGradients(self, optimizer, loss, trainableVars = None): # tf.trainabl
     for parameters.
     '''
     def addTrainingOp(self, optimizer, gradients_vars):
-        with tf.variable_scope("train"):
+        with tf.compat.v1.variable_scope("train"):
             gradients, variables = zip(*gradients_vars)
-            norm = tf.global_norm(gradients)
+            norm = tf.compat.v1.global_norm(gradients)
 
             # gradient clipping
             if config.clipGradients:
@@ -650,14 +650,14 @@ def addTrainingOp(self, optimizer, gradients_vars):
                 gradients_vars = zip(clippedGradients, variables)
 
             # updates ops (for batch norm) and train op
-            updateOps = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
+            updateOps = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.UPDATE_OPS)
             with tf.control_dependencies(updateOps):
                 train = optimizer.apply_gradients(gradients_vars, global_step = self.globalStep)
 
             # exponential moving average
             if config.useEMA:
                 ema = tf.train.ExponentialMovingAverage(decay = config.emaDecayRate)
-                maintainAveragesOp = ema.apply(tf.trainable_variables())
+                maintainAveragesOp = ema.apply(tf.compat.v1.trainable_variables())
 
                 with tf.control_dependencies([train]):
                     trainAndUpdateOp = tf.group(maintainAveragesOp)
@@ -771,7 +771,7 @@ def build(self):
         self.answerAccList = []
         self.predsList = []
 
-        with tf.variable_scope("macModel"):
+        with tf.compat.v1.variable_scope("macModel"):
             for i in range(config.gpusNum):
                 with tf.device("/gpu:{}".format(i)):
                     with tf.name_scope("tower{}".format(i)) as scope:
@@ -821,7 +821,7 @@ def build(self):
                         self.gradientVarsList.append(gradient_vars)
 
                         # reuse variables in next towers
-                        tf.get_variable_scope().reuse_variables()
+                        tf.compat.v1.get_variable_scope().reuse_variables()
 
         self.averageAcrossTowers(config.gpusNum)
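One detail worth keeping in mind for the dropout calls throughout model.py, mac_cell.py, and ops.py: the dropout placeholders hold keep probabilities, which is what the second positional argument of tf.compat.v1.nn.dropout expects, whereas the TF2-native tf.nn.dropout takes a drop rate. A small sketch of the equivalence, with made-up shapes:

# Sketch: keep_prob (compat.v1) vs. rate (TF2) dropout semantics.
import tensorflow as tf
tf.compat.v1.disable_eager_execution()

x = tf.compat.v1.placeholder(tf.float32, shape=(None, 512))
keep_prob = tf.compat.v1.placeholder(tf.float32, shape=())

y_v1 = tf.compat.v1.nn.dropout(x, keep_prob)      # keeps each unit with probability keep_prob
y_v2 = tf.nn.dropout(x, rate=1.0 - keep_prob)     # same behaviour, TF2-style argument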
diff --git a/ops.py b/ops.py
index 509ff0d2..c2401cdb 100644
--- a/ops.py
+++ b/ops.py
@@ -1,6 +1,7 @@
 from __future__ import division
 import math
 import tensorflow as tf
+import tensorflow_addons as tfa
 
 from mi_gru_cell import MiGRUCell
 from mi_lstm_cell import MiLSTMCell
@@ -16,29 +17,29 @@
 Uses random_normal initialization if 1d, otherwise uses xavier.
 '''
 def getWeight(shape, name = ""):
-    with tf.variable_scope("weights"):
-        initializer = tf.contrib.layers.xavier_initializer()
+    with tf.compat.v1.variable_scope("weights"):
+        initializer = tf.compat.v1.keras.initializers.glorot_normal()
         # if len(shape) == 1: # good?
         #     initializer = tf.random_normal_initializer()
-        W = tf.get_variable("weight" + name, shape = shape, initializer = initializer)
+        W = tf.compat.v1.get_variable("weight" + name, shape = shape, initializer = initializer)
     return W
 
 '''
 Initializes a weight matrix variable given a shape and a name. Uses xavier
 '''
 def getKernel(shape, name = ""):
-    with tf.variable_scope("kernels"):
-        initializer = tf.contrib.layers.xavier_initializer()
-        W = tf.get_variable("kernel" + name, shape = shape, initializer = initializer)
+    with tf.compat.v1.variable_scope("kernels"):
+        initializer = tf.compat.v1.keras.initializers.glorot_normal()
+        W = tf.compat.v1.get_variable("kernel" + name, shape = shape, initializer = initializer)
     return W
 
 '''
 Initializes a bias variable given a shape and a name.
 '''
 def getBias(shape, name = ""):
-    with tf.variable_scope("biases"):
+    with tf.compat.v1.variable_scope("biases"):
         initializer = tf.zeros_initializer()
-        b = tf.get_variable("bias" + name, shape = shape, initializer = initializer)
+        b = tf.compat.v1.get_variable("bias" + name, shape = shape, initializer = initializer)
     return b
 
 ######################################### basics #########################################
@@ -86,7 +87,7 @@ def L2RegularizationOp(l2 = None):
         l2 = config.l2
     l2Loss = 0
     names = ["weight", "kernel"]
-    for var in tf.trainable_variables():
+    for var in tf.compat.v1.trainable_variables():
         if any((name in var.name.lower()) for name in names):
             l2Loss += tf.nn.l2_loss(var)
     return l2 * l2Loss
@@ -112,7 +113,7 @@
 '''
 sumMod = ["LIN", "SUM"]
 def inter2logits(interactions, dim, sumMod = "LIN", dropout = 1.0, name = "", reuse = None):
-    with tf.variable_scope("inter2logits" + name, reuse = reuse):
+    with tf.compat.v1.variable_scope("inter2logits" + name, reuse = reuse):
         if sumMod == "SUM":
             logits = tf.reduce_sum(interactions, axis = -1)
         else: # "LIN"
@@ -138,7 +139,7 @@ def inter2logits(interactions, dim, sumMod = "LIN", dropout = 1.0, name = "", re
 [batchSize, N]
 '''
 def inter2att(interactions, dim, dropout = 1.0, name = "", reuse = None):
-    with tf.variable_scope("inter2att" + name, reuse = reuse):
+    with tf.compat.v1.variable_scope("inter2att" + name, reuse = reuse):
         logits = inter2logits(interactions, dim, dropout = dropout)
     attention = tf.nn.softmax(logits)
     return attention
@@ -160,8 +161,8 @@ def att2Smry(attention, features):
 '''
 def relu(inp):
     if config.relu == "PRM":
-        with tf.variable_scope(None, default_name = "prelu"):
-            alpha = tf.get_variable("alpha", shape = inp.get_shape()[-1],
+        with tf.compat.v1.variable_scope(None, default_name = "prelu"):
+            alpha = tf.compat.v1.get_variable("alpha", shape = inp.get_shape()[-1],
                 initializer = tf.constant_initializer(0.25))
             pos = tf.nn.relu(inp)
             neg = - (alpha * tf.nn.relu(-inp))
@@ -188,8 +189,8 @@ def relu(inp):
 
 # Sample from Gumbel(0, 1)
 def sampleGumbel(shape):
-    U = tf.random_uniform(shape, minval = 0, maxval = 1)
-    return -tf.log(-tf.log(U + eps) + eps)
+    U = tf.compat.v1.random_uniform(shape, minval = 0, maxval = 1)
+    return -tf.compat.v1.log(-tf.compat.v1.log(U + eps) + eps)
 
 # Draw a sample from the Gumbel-Softmax distribution
 def gumbelSoftmaxSample(logits, temperature):
@@ -229,7 +230,7 @@ def softmaxDiscrete(logits, temperature, train):
         return tf.nn.softmax(logits)
 
 def parametricDropout(name, train):
-    var = tf.get_variable("varDp" + name, shape = (), initializer = tf.constant_initializer(2),
+    var = tf.compat.v1.get_variable("varDp" + name, shape = (), initializer = tf.constant_initializer(2),
         dtype = tf.float32)
     dropout = tf.cond(train, lambda: tf.sigmoid(var), lambda: 1.0)
     return dropout
@@ -251,7 +252,7 @@ def expMask(seq, seqLength):
 '''
 def seq2SeqLoss(logits, targets, lengths):
     mask = tf.sequence_mask(lengths, maxlen = tf.shape(targets)[1])
-    loss = tf.contrib.seq2seq.sequence_loss(logits, targets, tf.to_float(mask))
+    loss = tfa.seq2seq.sequence_loss(logits, targets, tf.compat.v1.to_float(mask))
     return loss
 
 '''
@@ -262,12 +263,12 @@ def seq2SeqLoss(logits, targets, lengths):
 def seq2seqAcc(preds, targets, lengths):
     mask = tf.sequence_mask(lengths, maxlen = tf.shape(targets)[1])
     corrects = tf.logical_and(tf.equal(preds, targets), mask)
-    numCorrects = tf.reduce_sum(tf.to_int32(corrects), axis = 1)
+    numCorrects = tf.reduce_sum(tf.compat.v1.to_int32(corrects), axis = 1)
 
-    acc1 = tf.to_float(numCorrects) / (tf.to_float(lengths) + eps) # add small eps instead?
+    acc1 = tf.compat.v1.to_float(numCorrects) / (tf.compat.v1.to_float(lengths) + eps) # add small eps instead?
     acc1 = tf.reduce_mean(acc1)
 
-    acc2 = tf.to_float(tf.equal(numCorrects, lengths))
+    acc2 = tf.compat.v1.to_float(tf.equal(numCorrects, lengths))
     acc2 = tf.reduce_mean(acc2)
 
     return acc1, acc2
@@ -300,16 +301,16 @@ def linear(inp, inDim, outDim, dropout = 1.0, act = "NON",
     actLayer = True, actDropout = 1.0,
     retVars = False, name = "", reuse = None):
 
-    with tf.variable_scope("linearLayer" + name, reuse = reuse):
+    with tf.compat.v1.variable_scope("linearLayer" + name, reuse = reuse):
         W = getWeight((inDim, outDim) if outDim > 1 else (inDim, ))
         b = getBias((outDim, ) if outDim > 1 else ()) + bias
 
         if batchNorm is not None:
-            inp = tf.contrib.layers.batch_norm(inp, decay = batchNorm["decay"],
-                center = True, scale = True, is_training = batchNorm["train"],
-                updates_collections = None)
+            inp = tf.compat.v1.layers.batch_normalization(inp, momentum = batchNorm["decay"],
+                center = True, scale = True, training = batchNorm["train"])
             # tf.layers.batch_normalization, axis -1 ?
 
-        inp = tf.nn.dropout(inp, dropout)
+        inp = tf.compat.v1.nn.dropout(inp, dropout)
 
         if outDim > 1:
             output = multiply(inp, W)
@@ -380,7 +381,7 @@ def FCLayer(features, dims, batchNorm = None, dropout = 1.0, act = "RELU"):
 def cnn(inp, inDim, outDim, batchNorm = None, dropout = 1.0, addBias = True,
     kernelSize = None, stride = 1, act = "NON", name = "", reuse = None):
 
-    with tf.variable_scope("cnnLayer" + name, reuse = reuse):
+    with tf.compat.v1.variable_scope("cnnLayer" + name, reuse = reuse):
         if kernelSize is None:
             kernelSize = config.stemKernelSize
@@ -390,12 +391,12 @@ def cnn(inp, inDim, outDim, batchNorm = None, dropout = 1.0, addBias = True,
         b = getBias((outDim, ))
 
         if batchNorm is not None:
-            inp = tf.contrib.layers.batch_norm(inp, decay = batchNorm["decay"], center = batchNorm["center"],
-                scale = batchNorm["scale"], is_training = batchNorm["train"], updates_collections = None)
+            inp = tf.compat.v1.layers.batch_normalization(inp, momentum = batchNorm["decay"], center = batchNorm["center"],
+                scale = batchNorm["scale"], training = batchNorm["train"])
 
-        inp = tf.nn.dropout(inp, dropout)
+        inp = tf.compat.v1.nn.dropout(inp, dropout)
 
-        output = tf.nn.conv2d(inp, filter = kernel, strides = [1, stride, stride, 1], padding = "SAME")
+        output = tf.compat.v1.nn.conv2d(inp, filter = kernel, strides = [1, stride, stride, 1], padding = "SAME")
 
         if addBias:
             output += b
@@ -464,9 +465,9 @@ def locationL(h, w, dim, outDim = -1, addBias = True):
 # dim % 4 = 0
 # h,w can be tensor scalars
 def locationPE(h, w, dim, outDim = -1, addBias = True):
-    x = tf.expand_dims(tf.to_float(tf.linspace(-config.locationBias, config.locationBias, w)), axis = -1)
-    y = tf.expand_dims(tf.to_float(tf.linspace(-config.locationBias, config.locationBias, h)), axis = -1)
-    i = tf.expand_dims(tf.to_float(tf.range(dim)), axis = 0)
+    x = tf.expand_dims(tf.compat.v1.to_float(tf.linspace(-config.locationBias, config.locationBias, w)), axis = -1)
+    y = tf.expand_dims(tf.compat.v1.to_float(tf.linspace(-config.locationBias, config.locationBias, h)), axis = -1)
+    i = tf.expand_dims(tf.compat.v1.to_float(tf.range(dim)), axis = 0)
 
     peSinX = tf.sin(x / (tf.pow(10000.0, i / dim)))
     peCosX = tf.cos(x / (tf.pow(10000.0, i / dim)))
@@ -514,7 +515,7 @@ def locationPE(h, w, dim, outDim = -1, addBias = True):
 def addLocation(features, inDim, lDim, outDim = -1, h = None, w = None,
     locType = "L", mod = "CNCT", name = "", reuse = None): # h,w not needed
 
-    with tf.variable_scope("addLocation" + name, reuse = reuse):
+    with tf.compat.v1.variable_scope("addLocation" + name, reuse = reuse):
         batchSize = tf.shape(features)[0]
         if h is None:
             h = tf.shape(features)[1]
@@ -668,15 +669,15 @@ def linearizeFeatures(features, h, w, inDim, projDim = None, outDim = None,
 def mul(x, y, dim, dropout = 1.0, proj = None, interMod = "MUL", concat = None,
     mulBias = None, extendY = True, name = "", reuse = None):
 
-    with tf.variable_scope("mul" + name, reuse = reuse):
+    with tf.compat.v1.variable_scope("mul" + name, reuse = reuse):
         origVals = {"x": x, "y": y, "dim": dim}
 
-        x = tf.nn.dropout(x, dropout)
-        y = tf.nn.dropout(y, dropout)
+        x = tf.compat.v1.nn.dropout(x, dropout)
+        y = tf.compat.v1.nn.dropout(y, dropout)
 
         # projection
         if proj is not None:
-            x = tf.nn.dropout(x, proj.get("dropout", 1.0))
-            y = tf.nn.dropout(y, proj.get("dropout", 1.0))
+            x = tf.compat.v1.nn.dropout(x, proj.get("dropout", 1.0))
+            y = tf.compat.v1.nn.dropout(y, proj.get("dropout", 1.0))
 
             if proj["shared"]:
                 xName, xReuse = "proj", None
@@ -753,16 +754,16 @@ def createCell(hDim, reuse, cellType = None, act = None, projDim = None):
     activation = activations.get(act, None)
 
     if cellType == "ProjLSTM":
-        cell = tf.nn.rnn_cell.LSTMCell
+        cell = tf.compat.v1.nn.rnn_cell.LSTMCell
         if projDim is None:
             projDim = config.cellDim
         cell = cell(hDim, num_proj = projDim, reuse = reuse, activation = activation)
         return cell
 
     cells = {
-        "RNN": tf.nn.rnn_cell.BasicRNNCell,
-        "GRU": tf.nn.rnn_cell.GRUCell,
-        "LSTM": tf.nn.rnn_cell.BasicLSTMCell,
+        "RNN": tf.compat.v1.nn.rnn_cell.BasicRNNCell,
+        "GRU": tf.compat.v1.nn.rnn_cell.GRUCell,
+        "LSTM": tf.compat.v1.nn.rnn_cell.BasicLSTMCell,
         "MiGRU": MiGRUCell,
         "MiLSTM": MiLSTMCell
     }
@@ -798,27 +799,27 @@
 def fwRNNLayer(inSeq, seqL, hDim, cellType = None, dropout = 1.0, varDp = None,
     name = "", reuse = None): # proj = None
 
-    with tf.variable_scope("rnnLayer" + name, reuse = reuse):
+    with tf.compat.v1.variable_scope("rnnLayer" + name, reuse = reuse):
         batchSize = tf.shape(inSeq)[0]
 
         cell = createCell(hDim, reuse, cellType) # passing reuse isn't mandatory
 
         if varDp is not None:
-            cell = tf.contrib.rnn.DropoutWrapper(cell,
+            cell = tf.compat.v1.nn.rnn_cell.DropoutWrapper(cell,
                 state_keep_prob = varDp["stateDp"],
                 input_keep_prob = varDp["inputDp"],
                 variational_recurrent = True, input_size = varDp["inputSize"], dtype = tf.float32)
         else:
-            inSeq = tf.nn.dropout(inSeq, dropout)
+            inSeq = tf.compat.v1.nn.dropout(inSeq, dropout)
 
         initialState = cell.zero_state(batchSize, tf.float32)
 
-        outSeq, lastState = tf.nn.dynamic_rnn(cell, inSeq,
+        outSeq, lastState = tf.compat.v1.nn.dynamic_rnn(cell, inSeq,
             sequence_length = seqL,
             initial_state = initialState,
             swap_memory = True)
 
-        if isinstance(lastState, tf.nn.rnn_cell.LSTMStateTuple):
+        if isinstance(lastState, tf.compat.v1.nn.rnn_cell.LSTMStateTuple):
             lastState = lastState.h
 
         # if proj is not None:
@@ -859,38 +860,38 @@
 def biRNNLayer(inSeq, seqL, hDim, cellType = None, dropout = 1.0, varDp = None,
     name = "", reuse = None): # proj = None,
 
-    with tf.variable_scope("birnnLayer" + name, reuse = reuse):
+    with tf.compat.v1.variable_scope("birnnLayer" + name, reuse = reuse):
         batchSize = tf.shape(inSeq)[0]
 
-        with tf.variable_scope("fw"):
+        with tf.compat.v1.variable_scope("fw"):
             cellFw = createCell(hDim, reuse, cellType)
-        with tf.variable_scope("bw"):
+        with tf.compat.v1.variable_scope("bw"):
             cellBw = createCell(hDim, reuse, cellType)
 
         if varDp is not None:
-            cellFw = tf.contrib.rnn.DropoutWrapper(cellFw,
+            cellFw = tf.compat.v1.nn.rnn_cell.DropoutWrapper(cellFw,
                 state_keep_prob = varDp["stateDp"],
                 input_keep_prob = varDp["inputDp"],
                 variational_recurrent = True, input_size = varDp["inputSize"], dtype = tf.float32)
 
-            cellBw = tf.contrib.rnn.DropoutWrapper(cellBw,
+            cellBw = tf.compat.v1.nn.rnn_cell.DropoutWrapper(cellBw,
                 state_keep_prob = varDp["stateDp"],
                 input_keep_prob = varDp["inputDp"],
                 variational_recurrent = True, input_size = varDp["inputSize"], dtype = tf.float32)
         else:
-            inSeq = tf.nn.dropout(inSeq, dropout)
+            inSeq = tf.compat.v1.nn.dropout(inSeq, dropout)
 
         initialStateFw = cellFw.zero_state(batchSize, tf.float32)
         initialStateBw = cellBw.zero_state(batchSize, tf.float32)
 
-        (outSeqFw, outSeqBw), (lastStateFw, lastStateBw) = tf.nn.bidirectional_dynamic_rnn(
+        (outSeqFw, outSeqBw), (lastStateFw, lastStateBw) = tf.compat.v1.nn.bidirectional_dynamic_rnn(
             cellFw, cellBw, inSeq,
             sequence_length = seqL,
             initial_state_fw = initialStateFw,
            initial_state_bw = initialStateBw,
            swap_memory = True)
 
-        if isinstance(lastStateFw, tf.nn.rnn_cell.LSTMStateTuple):
+        if isinstance(lastStateFw, tf.compat.v1.nn.rnn_cell.LSTMStateTuple):
             lastStateFw = lastStateFw.h # take c?
             lastStateBw = lastStateBw.h
@@ -940,7 +941,7 @@ def biRNNLayer(inSeq, seqL, hDim, cellType = None, dropout = 1.0, varDp = None,
 def RNNLayer(inSeq, seqL, hDim, bi = None, cellType = None, dropout = 1.0, varDp = None,
     name = "", reuse = None): # proj = None
 
-    with tf.variable_scope("rnnLayer" + name, reuse = reuse):
+    with tf.compat.v1.variable_scope("rnnLayer" + name, reuse = reuse):
         if bi is None:
             bi = config.encBi
@@ -954,7 +955,7 @@ def RNNLayer(inSeq, seqL, hDim, bi = None, cellType = None, dropout = 1.0, varDp
 # tf counterpart?
 # hDim = config.moduleDim
 def multigridRNNLayer(featrues, h, w, dim, name = "", reuse = None):
-    with tf.variable_scope("multigridRNNLayer" + name, reuse = reuse):
+    with tf.compat.v1.variable_scope("multigridRNNLayer" + name, reuse = reuse):
         featrues = linear(featrues, dim, dim / 2, name = "i")
 
         output0 = gridRNNLayer(featrues, h, w, dim, right = True, down = True, name = "rd")
@@ -965,11 +966,11 @@ def multigridRNNLayer(featrues, h, w, dim, name = "", reuse = None):
         output = tf.concat([output0, output1, output2, output3], axis = -1)
         output = linear(output, 2 * dim, dim, name = "o")
 
-    return outputs
+    return output
 
 # h,w should be constants
 def gridRNNLayer(features, h, w, dim, right, down, name = "", reuse = None):
-    with tf.variable_scope("gridRNNLayer" + name):
+    with tf.compat.v1.variable_scope("gridRNNLayer" + name):
         batchSize = tf.shape(features)[0]
 
         cell = createCell(dim, reuse = reuse, cellType = config.stemGridRnnMod,
@@ -1001,7 +1002,7 @@ def gridRNNLayer(features, h, w, dim, right, down, name = "", reuse = None):
 # tf seq2seq?
 # def projRNNLayer(inSeq, seqL, hDim, labels, labelsNum, labelsDim, labelsEmb, name = "", reuse = None):
-#     with tf.variable_scope("projRNNLayer" + name):
+#     with tf.compat.v1.variable_scope("projRNNLayer" + name):
 #         batchSize = tf.shape(features)[0]
 
 #         cell = createCell(hDim, reuse = reuse)
@@ -1034,7 +1035,7 @@ def gridRNNLayer(features, h, w, dim, right, down, name = "", reuse = None):
 #             chosenOut = tf.stack(chosenList, axis = 1)
 #             outputs = (logitsOut, chosenOut)
 #         else:
-#             labels = tf.to_float(labels)
+#             labels = tf.compat.v1.to_float(labels)
 #             labels = tf.concat([tf.zeros((batchSize, 1)), labels], axis = 1)[:, :-1] # ,newaxis
 #             inSeq = tf.concat([inSeq, tf.expand_dims(labels, axis = -1)], axis = -1)
@@ -1052,10 +1053,10 @@
 probability value.
 '''
 def generateVarDpMask(shape, keepProb):
-    randomTensor = tf.to_float(keepProb)
-    randomTensor += tf.random_uniform(shape, minval = 0, maxval = 1)
+    randomTensor = tf.compat.v1.to_float(keepProb)
+    randomTensor += tf.compat.v1.random_uniform(shape, minval = 0, maxval = 1)
     binaryTensor = tf.floor(randomTensor)
-    mask = tf.to_float(binaryTensor)
+    mask = tf.compat.v1.to_float(binaryTensor)
     return mask
 
 '''
@@ -1063,5 +1064,5 @@
 and a dropout probability value.
 '''
 def applyVarDpMask(inp, mask, keepProb):
-    ret = (tf.div(inp, tf.to_float(keepProb))) * mask
+    ret = (tf.compat.v1.div(inp, tf.compat.v1.to_float(keepProb))) * mask
     return ret
diff --git a/requirements.txt b/requirements.txt
index ae500bf8..5cf399f4 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,6 +5,7 @@ scipy
 torchvision
 h5py
 tensorflow
+tensorflow-addons
 tqdm
 termcolor
 matplotlib
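The tensorflow-addons dependency is added because tf.contrib.seq2seq.sequence_loss moved to tfa.seq2seq.sequence_loss, which seq2SeqLoss in ops.py now calls. A minimal usage sketch with made-up shapes, independent of the repository's graph:

# Sketch: sequence loss via tensorflow-addons, replacing tf.contrib.seq2seq.sequence_loss.
import tensorflow as tf
import tensorflow_addons as tfa

logits = tf.random.normal((2, 5, 10))                            # [batch, time, vocab]
targets = tf.random.uniform((2, 5), maxval=10, dtype=tf.int32)   # [batch, time]
lengths = tf.constant([5, 3])
mask = tf.cast(tf.sequence_mask(lengths, maxlen=5), tf.float32)  # per-step weights

loss = tfa.seq2seq.sequence_loss(logits, targets, mask)          # scalar, averaged over batch and time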