Please follow our implementation of the `Projector` module, shown below.

Lines 47 to 84 in 0df39f0

    
           class Projector(nn.Module):
               """Project visual features and filter them with a text-predicted kernel.

               The visual branch upsamples the feature map by 4x (two 2x bilinear
               steps). The textual branch maps each sentence embedding to the
               weights and bias of one ``kernel_size x kernel_size`` convolution
               filter, which is then applied to that sample's own feature map.
               """

               def __init__(self, word_dim=1024, in_dim=256, kernel_size=3):
                   super().__init__()
                   self.in_dim = in_dim
                   self.kernel_size = kernel_size

                   # Visual branch (os16 -> os4): upsample twice, ending with
                   # in_dim channels.
                   # NOTE(review): assumes conv_layer(in_ch, out_ch, k, padding)
                   # preserves spatial size -- confirm against its definition.
                   vis_layers = [
                       nn.Upsample(scale_factor=2, mode='bilinear'),
                       conv_layer(in_dim * 2, in_dim * 2, 3, padding=1),
                       nn.Upsample(scale_factor=2, mode='bilinear'),
                       conv_layer(in_dim * 2, in_dim, 3, padding=1),
                       nn.Conv2d(in_dim, in_dim, 1),
                   ]
                   self.vis = nn.Sequential(*vis_layers)

                   # Textual branch: one filter (in_dim * k * k weights) plus a
                   # single trailing bias value per sample.
                   n_filter_weights = in_dim * kernel_size * kernel_size
                   self.txt = nn.Linear(word_dim, n_filter_weights + 1)

               def forward(self, x, word):
                   """Apply each sample's text-generated filter to its feature map.

                   x:    b, in_dim * 2, h, w   (e.g. b, 512, 26, 26 with defaults)
                   word: b, word_dim
                   Returns a tensor of shape b, 1, 4h, 4w (e.g. b, 1, 104, 104).
                   """
                   feats = self.vis(x)
                   batch, channels, height, width = feats.shape

                   # Fold the batch into the channel axis so a grouped convolution
                   # can give every sample its own filter: 1, b*C, H, W.
                   folded = feats.reshape(1, batch * channels, height, width)

                   # b, (C*k*k + 1) -> filter weights b, C, k, k and bias b.
                   params = self.txt(word)
                   k = self.kernel_size
                   bias = params[:, -1]
                   weight = params[:, :-1].reshape(batch, channels, k, k)

                   # groups == b: group i convolves sample i's C channels with
                   # filter i only -> 1, b, H, W.
                   response = F.conv2d(
                       folded,
                       weight,
                       bias=bias,
                       padding=k // 2,
                       groups=weight.size(0),
                   )

                   # Swap the first two axes -> b, 1, H, W.
                   return response.transpose(0, 1)

	class Projector(nn.Module):
	    """Project visual features and filter them with a text-predicted kernel.

	    The visual branch upsamples the feature map by 4x (two 2x bilinear
	    steps). The textual branch turns each sentence embedding into the
	    weights and bias of a single ``kernel_size x kernel_size`` filter,
	    which is applied only to that sample's feature map.

	    Args:
	        word_dim: Feature size of the sentence embedding fed to ``txt``.
	        in_dim: Channel count of the projected visual features; the
	            incoming feature map is built to have ``in_dim * 2`` channels.
	        kernel_size: Side length of the text-generated filter.
	    """

	    def __init__(self, word_dim=1024, in_dim=256, kernel_size=3):
	        super().__init__()
	        self.in_dim = in_dim
	        self.kernel_size = kernel_size
	        # Visual projector (os16 -> os4).
	        # align_corners=False is the documented default for bilinear
	        # upsampling; it is spelled out so the choice is explicit.
	        # NOTE(review): assumes conv_layer(in_ch, out_ch, k, padding)
	        # preserves spatial size -- confirm against its definition.
	        self.vis = nn.Sequential(
	            nn.Upsample(scale_factor=2, mode='bilinear', align_corners=False),
	            conv_layer(in_dim * 2, in_dim * 2, 3, padding=1),
	            nn.Upsample(scale_factor=2, mode='bilinear', align_corners=False),
	            conv_layer(in_dim * 2, in_dim, 3, padding=1),
	            nn.Conv2d(in_dim, in_dim, 1))
	        # Textual projector: in_dim * k * k filter weights + 1 bias value.
	        out_dim = in_dim * kernel_size * kernel_size + 1
	        self.txt = nn.Linear(word_dim, out_dim)

	    def forward(self, x, word):
	        """Apply each sample's text-generated filter to its feature map.

	        Args:
	            x: Visual features, ``(b, in_dim * 2, h, w)``
	                (e.g. ``b, 512, 26, 26`` with the defaults).
	            word: Sentence embedding, ``(b, word_dim)``.

	        Returns:
	            Response map of shape ``(b, 1, 4h, 4w)``
	            (e.g. ``b, 1, 104, 104``).
	        """
	        x = self.vis(x)
	        B, C, H, W = x.size()
	        # Fold the batch into the channel axis: 1, b*256, 104, 104.
	        x = x.reshape(1, B * C, H, W)
	        # txt: b, (256*3*3 + 1) -> weight b, 256, 3, 3 and bias b.
	        word = self.txt(word)
	        weight, bias = word[:, :-1], word[:, -1]
	        weight = weight.reshape(B, C, self.kernel_size, self.kernel_size)
	        # Grouped conv with groups == b, so filter i only sees sample i:
	        # 1, b*256, 104, 104 -> 1, b, 104, 104.
	        out = F.conv2d(x,
	                       weight,
	                       padding=self.kernel_size // 2,
	                       groups=weight.size(0),
	                       bias=bias)
	        # Swap the first two axes -> b, 1, 104, 104.
	        out = out.transpose(0, 1)
	        return out

About loss function #5

Description

Metadata

Metadata

Assignees

Labels

Projects

Milestone

Relationships

Development

Issue actions