Commit 3f483be

ArthurZucker <arthur@huggingface.co> and Rocketknight1 authored
[PixtralLarge] Update Pixtral conversion script to support large format! (#34801)
* update conversion script
* update for bias again
* remove pdv
* use my dir
* Update how we initialize the tokenizer
* Convert in bfloat16
* Undo that one again
* fix config dump
* .to() was broken for BatchMixFeature
* quick debug breakpoint
* put the breakpoint in the right place
* Add a config flag for the multimodal projector bias
* Add a config flag for the multimodal projector bias
* Conversion script can load chat templates
* Indent config for comparison
* Stop clobbering the config
* Re-enable the config clobber
* Get rid of the config manual save - it has no effect!
* Handle adapter bias correctly
* Default vision transformer activation to silu
* Remove legacy processing path
* One commit with all the debug breakpoints before I delete them all, in case I need to revert
* Update conversion
* Remove vLLM debugging instrumentation
* Drop xformers
* Remove debug enumerates
* make fixup
* make fixup
* Break copied from in pixtral
* Propagate multimodal_projector_bias change
* Propagate multimodal_projector_bias change
* Remove debug device .to()
* Restore attention weights output
* Fix Pixtral test
* Drop image_seq_length
* Drop image_seq_length
* Put the legacy processing code back
* Add the bias option to the llava_next_video config
* Add the bias option to the llava_next_video config
* Make certain args required in converter
* Make certain args required in converter
* typo
* make fixup
* Reverting some dtype changes since it seems to work without them

---------

Co-authored-by: arthur@huggingface.co <arthur@ip-26-0-166-244.ec2.internal>
Co-authored-by: Matt <rocketknight1@gmail.com>
Co-authored-by: Matt <Rocketknight1@users.noreply.github.com>
1 parent 4c2c12b commit 3f483be

16 files changed: +200 −115 lines

src/transformers/models/llava/configuration_llava.py

Lines changed: 4 additions & 0 deletions
@@ -50,6 +50,8 @@ class LlavaConfig(PretrainedConfig):
             The index of the layer to select the vision feature.
         image_seq_length (`int`, *optional*, defaults to 576):
             Sequence length of one image embedding.
+        multimodal_projector_bias (`bool`, *optional*, defaults to `True`):
+            Whether to use bias in the multimodal projector.

     Example:

@@ -85,6 +87,7 @@ def __init__(
         vision_feature_select_strategy="default",
         vision_feature_layer=-2,
         image_seq_length=576,
+        multimodal_projector_bias=True,
         **kwargs,
     ):
         self.ignore_index = ignore_index
@@ -127,6 +130,7 @@ def __init__(
             text_config = CONFIG_MAPPING["llama"]()

         self.text_config = text_config
+        self.multimodal_projector_bias = multimodal_projector_bias

         super().__init__(**kwargs)
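
For context, a minimal sketch of how the new flag is used when building a config (assuming a transformers build that includes this change):

    from transformers import LlavaConfig

    # Default keeps the previous behaviour: both projector layers carry a bias.
    config = LlavaConfig()
    print(config.multimodal_projector_bias)  # True

    # Checkpoints whose projector ships no bias tensors (the case this commit's
    # conversion flag is meant to cover) can turn it off explicitly.
    config = LlavaConfig(multimodal_projector_bias=False)
    print(config.multimodal_projector_bias)  # False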

src/transformers/models/llava/modeling_llava.py

Lines changed: 6 additions & 3 deletions
@@ -86,10 +86,13 @@ class LlavaCausalLMOutputWithPast(ModelOutput):
 class LlavaMultiModalProjector(nn.Module):
     def __init__(self, config: LlavaConfig):
         super().__init__()
-
-        self.linear_1 = nn.Linear(config.vision_config.hidden_size, config.text_config.hidden_size, bias=True)
+        self.linear_1 = nn.Linear(
+            config.vision_config.hidden_size, config.text_config.hidden_size, bias=config.multimodal_projector_bias
+        )
         self.act = ACT2FN[config.projector_hidden_act]
-        self.linear_2 = nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size, bias=True)
+        self.linear_2 = nn.Linear(
+            config.text_config.hidden_size, config.text_config.hidden_size, bias=config.multimodal_projector_bias
+        )

     def forward(self, image_features):
         hidden_states = self.linear_1(image_features)
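
A short sketch of what the change means for the module itself (the import path mirrors the file above; torch must be installed):

    from transformers import LlavaConfig
    from transformers.models.llava.modeling_llava import LlavaMultiModalProjector

    projector = LlavaMultiModalProjector(LlavaConfig(multimodal_projector_bias=False))

    # With the flag off, nn.Linear registers no bias parameter at all.
    print(projector.linear_1.bias)  # None
    print(projector.linear_2.bias)  # None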

src/transformers/models/llava_next/configuration_llava_next.py

Lines changed: 4 additions & 0 deletions
@@ -55,6 +55,8 @@ class LlavaNextConfig(PretrainedConfig):
             Whether the model's input and output word embeddings should be tied.
         image_seq_length (`int`, *optional*, defaults to 576):
             Sequence length of one image embedding.
+        multimodal_projector_bias (`bool`, *optional*, defaults to `True`):
+            Whether to use bias in the multimodal projector.

     Example:

@@ -92,12 +94,14 @@ def __init__(
         image_grid_pinpoints=None,
         tie_word_embeddings=False,
         image_seq_length=576,
+        multimodal_projector_bias=True,
         **kwargs,
     ):
         self.ignore_index = ignore_index
         self.image_token_index = image_token_index
         self.projector_hidden_act = projector_hidden_act
         self.image_seq_length = image_seq_length
+        self.multimodal_projector_bias = multimodal_projector_bias

         if vision_feature_select_strategy not in ["default", "full"]:
             raise ValueError(
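
Because the flag is stored as a plain config attribute, it should serialize like any other field; a small sketch assuming the usual `to_dict`/`from_dict` behaviour:

    from transformers import LlavaNextConfig

    config = LlavaNextConfig(multimodal_projector_bias=False)
    restored = LlavaNextConfig.from_dict(config.to_dict())
    assert restored.multimodal_projector_bias is False

    # Configs saved before this change simply pick up the default of True.
    assert LlavaNextConfig().multimodal_projector_bias is True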

src/transformers/models/llava_next/modeling_llava_next.py

Lines changed: 6 additions & 3 deletions
@@ -194,10 +194,13 @@ class LlavaNextCausalLMOutputWithPast(ModelOutput):
 class LlavaNextMultiModalProjector(nn.Module):
     def __init__(self, config: LlavaNextConfig):
         super().__init__()
-
-        self.linear_1 = nn.Linear(config.vision_config.hidden_size, config.text_config.hidden_size, bias=True)
+        self.linear_1 = nn.Linear(
+            config.vision_config.hidden_size, config.text_config.hidden_size, bias=config.multimodal_projector_bias
+        )
         self.act = ACT2FN[config.projector_hidden_act]
-        self.linear_2 = nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size, bias=True)
+        self.linear_2 = nn.Linear(
+            config.text_config.hidden_size, config.text_config.hidden_size, bias=config.multimodal_projector_bias
+        )

     def forward(self, image_features):
         hidden_states = self.linear_1(image_features)
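
The flag also determines which keys the projector exposes in its state dict, which is what a conversion script ultimately has to match; a rough check (class path as in the diff above):

    from transformers import LlavaNextConfig
    from transformers.models.llava_next.modeling_llava_next import LlavaNextMultiModalProjector

    with_bias = LlavaNextMultiModalProjector(LlavaNextConfig(multimodal_projector_bias=True))
    without_bias = LlavaNextMultiModalProjector(LlavaNextConfig(multimodal_projector_bias=False))

    print(sorted(with_bias.state_dict()))     # ['linear_1.bias', 'linear_1.weight', 'linear_2.bias', 'linear_2.weight']
    print(sorted(without_bias.state_dict()))  # ['linear_1.weight', 'linear_2.weight']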

src/transformers/models/llava_next_video/configuration_llava_next_video.py

Lines changed: 4 additions & 0 deletions
@@ -44,6 +44,8 @@ class LlavaNextVideoConfig(PretrainedConfig):
             The image token index to encode the image prompt.
         projector_hidden_act (`str`, *optional*, defaults to `"gelu"`):
             The activation function used by the multimodal projector.
+        multimodal_projector_bias (`bool`, *optional*, defaults to `True`):
+            Whether to use bias in the multimodal projector.
         vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`):
             The feature selection strategy used to select the vision feature from the vision backbone.
             Can be one of `"default"` or `"full"`. If `"default"`, the CLS token is removed from the vision features.
@@ -95,6 +97,7 @@ def __init__(
         ignore_index=-100,
         image_token_index=32001,
         projector_hidden_act="gelu",
+        multimodal_projector_bias=True,
         vision_feature_select_strategy="default",
         vision_feature_layer=-2,
         image_grid_pinpoints=None,
@@ -114,6 +117,7 @@ def __init__(
         self.ignore_index = ignore_index
         self.image_token_index = image_token_index
         self.projector_hidden_act = projector_hidden_act
+        self.multimodal_projector_bias = multimodal_projector_bias

         if vision_feature_select_strategy not in ["default", "full"]:
             raise ValueError(

src/transformers/models/llava_next_video/modeling_llava_next_video.py

Lines changed: 6 additions & 3 deletions
@@ -179,10 +179,13 @@ def _init_weights(self, module):
 class LlavaNextVideoMultiModalProjector(nn.Module):
     def __init__(self, config: LlavaNextVideoConfig):
         super().__init__()
-
-        self.linear_1 = nn.Linear(config.vision_config.hidden_size, config.text_config.hidden_size, bias=True)
+        self.linear_1 = nn.Linear(
+            config.vision_config.hidden_size, config.text_config.hidden_size, bias=config.multimodal_projector_bias
+        )
         self.act = ACT2FN[config.projector_hidden_act]
-        self.linear_2 = nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size, bias=True)
+        self.linear_2 = nn.Linear(
+            config.text_config.hidden_size, config.text_config.hidden_size, bias=config.multimodal_projector_bias
+        )

     def forward(self, image_features):
         hidden_states = self.linear_1(image_features)
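
In a conversion script, one plausible way to pick the flag is to check whether the source checkpoint ships projector bias tensors at all; the helper and key names below are purely illustrative, not the actual Pixtral/Mistral checkpoint layout:

    def infer_projector_bias(original_state_dict: dict) -> bool:
        # Hypothetical heuristic: enable the flag only if any projector bias tensor exists.
        return any(
            "projector" in key and key.endswith(".bias") for key in original_state_dict
        )

    # Illustrative (made-up) key set for a checkpoint whose projector has no bias.
    keys = {"projector.linear_1.weight": ..., "projector.linear_2.weight": ...}
    print(infer_projector_bias(keys))  # False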

src/transformers/models/llava_next_video/modular_llava_next_video.py

Lines changed: 4 additions & 0 deletions
@@ -58,6 +58,8 @@ class LlavaNextVideoConfig(PretrainedConfig):
             The image token index to encode the image prompt.
         projector_hidden_act (`str`, *optional*, defaults to `"gelu"`):
             The activation function used by the multimodal projector.
+        multimodal_projector_bias (`bool`, *optional*, defaults to `True`):
+            Whether to use bias in the multimodal projector.
         vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`):
             The feature selection strategy used to select the vision feature from the vision backbone.
             Can be one of `"default"` or `"full"`. If `"default"`, the CLS token is removed from the vision features.
@@ -109,6 +111,7 @@ def __init__(
         ignore_index=-100,
         image_token_index=32001,
         projector_hidden_act="gelu",
+        multimodal_projector_bias=True,
         vision_feature_select_strategy="default",
         vision_feature_layer=-2,
         image_grid_pinpoints=None,
@@ -128,6 +131,7 @@ def __init__(
         self.ignore_index = ignore_index
         self.image_token_index = image_token_index
         self.projector_hidden_act = projector_hidden_act
+        self.multimodal_projector_bias = multimodal_projector_bias

         if vision_feature_select_strategy not in ["default", "full"]:
             raise ValueError(

src/transformers/models/llava_onevision/configuration_llava_onevision.py

Lines changed: 4 additions & 0 deletions
@@ -58,6 +58,8 @@ class LlavaOnevisionConfig(PretrainedConfig):
             of the form `(height, width)`.
         tie_word_embeddings (`bool`, *optional*, defaults to `False`):
             Whether the model's input and output word embeddings should be tied.
+        multimodal_projector_bias (`bool`, *optional*, defaults to `True`):
+            Whether to use bias in the multimodal projector.

     Example:

@@ -95,11 +97,13 @@ def __init__(
         vision_aspect_ratio="anyres_max_9",
         image_grid_pinpoints=None,
         tie_word_embeddings=False,
+        multimodal_projector_bias=True,
         **kwargs,
     ):
         self.image_token_index = image_token_index
         self.video_token_index = video_token_index
         self.projector_hidden_act = projector_hidden_act
+        self.multimodal_projector_bias = multimodal_projector_bias

         if vision_feature_select_strategy not in ["default", "full"]:
             raise ValueError(

src/transformers/models/llava_onevision/modeling_llava_onevision.py

Lines changed: 6 additions & 3 deletions
@@ -201,10 +201,13 @@ class LlavaOnevisionCausalLMOutputWithPast(ModelOutput):
 class LlavaOnevisionMultiModalProjector(nn.Module):
     def __init__(self, config: LlavaOnevisionConfig):
         super().__init__()
-
-        self.linear_1 = nn.Linear(config.vision_config.hidden_size, config.text_config.hidden_size, bias=True)
+        self.linear_1 = nn.Linear(
+            config.vision_config.hidden_size, config.text_config.hidden_size, bias=config.multimodal_projector_bias
+        )
         self.act = ACT2FN[config.projector_hidden_act]
-        self.linear_2 = nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size, bias=True)
+        self.linear_2 = nn.Linear(
+            config.text_config.hidden_size, config.text_config.hidden_size, bias=config.multimodal_projector_bias
+        )

     def forward(self, image_features):
         hidden_states = self.linear_1(image_features)
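
Finally, since Pixtral checkpoints are exposed through the Llava classes in transformers, the new flag composes with a Pixtral vision backbone; a sketch with default sizes (whether the real large-format weights use a projector bias is decided by the conversion script, not shown here):

    from transformers import LlavaConfig, PixtralVisionConfig

    vision_config = PixtralVisionConfig()  # defaults; the conversion script fills in the large-format sizes
    config = LlavaConfig(vision_config=vision_config, multimodal_projector_bias=False)
    print(config.multimodal_projector_bias)  # False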
