 from transformers import (
     AutoModelForCausalLM,
+    AutoModelForImageTextToText,
     AutoModelForSeq2SeqLM,
     AutoModelForSpeechSeq2Seq,
     AutoModelForVision2Seq,
@@ -4720,6 +4721,60 @@ def test_generate_vision2text_conditioning(self):
         self.assertTrue(np.array_equal(output_sequences_decoder_input_ids, output_sequences_input_ids))
         self.assertTrue(np.array_equal(output_sequences_decoder_input_ids[:, 1:2], conditioning_input))
 
+    @slow
+    @require_torch_gpu
+    def test_cache_device_map_with_vision_layer_device_map(self):
+        """
+        Test that the cache device map is correctly set when the vision layers have a device map.
+        Regression test for #36942.
+        """
+        # Gemma 3 uses a hybrid cache, which can be compiled -> it needs a device map at allocation time
+        model_id = "google/gemma-3-4b-it"
+
+        # The important part of this device map: the `.layers.` pattern is NOT present in the decoder
+        device_map = {
+            "vision_tower.vision_model.embeddings": 0,
+            "vision_tower.vision_model.encoder.layers.0": 0,
+            "vision_tower.vision_model.encoder.layers.1": 0,
+            "vision_tower.vision_model.encoder.layers.2": 0,
+            "vision_tower.vision_model.encoder.layers.3": 0,
+            "vision_tower.vision_model.encoder.layers.4": 0,
+            "vision_tower.vision_model.encoder.layers.5": 0,
+            "vision_tower.vision_model.encoder.layers.6": 0,
+            "vision_tower.vision_model.encoder.layers.7": 0,
+            "vision_tower.vision_model.encoder.layers.8": 0,
+            "vision_tower.vision_model.encoder.layers.9": 0,
+            "vision_tower.vision_model.encoder.layers.10": 0,
+            "vision_tower.vision_model.encoder.layers.11": 0,
+            "vision_tower.vision_model.encoder.layers.12": 0,
+            "vision_tower.vision_model.encoder.layers.13": 0,
+            "vision_tower.vision_model.encoder.layers.14": "cpu",
+            "vision_tower.vision_model.encoder.layers.15": "cpu",
+            "vision_tower.vision_model.encoder.layers.16": "cpu",
+            "vision_tower.vision_model.encoder.layers.17": "cpu",
+            "vision_tower.vision_model.encoder.layers.18": "cpu",
+            "vision_tower.vision_model.encoder.layers.19": "cpu",
+            "vision_tower.vision_model.encoder.layers.20": "cpu",
+            "vision_tower.vision_model.encoder.layers.21": "cpu",
+            "vision_tower.vision_model.encoder.layers.22": "cpu",
+            "vision_tower.vision_model.encoder.layers.23": "cpu",
+            "vision_tower.vision_model.encoder.layers.24": "cpu",
+            "vision_tower.vision_model.encoder.layers.25": "cpu",
+            "vision_tower.vision_model.encoder.layers.26": "cpu",
+            "vision_tower.vision_model.post_layernorm": "cpu",
+            "multi_modal_projector": "cpu",
+            "language_model": "cpu",
+        }
+
+        model = AutoModelForImageTextToText.from_pretrained(
+            model_id, device_map=device_map, torch_dtype=torch.bfloat16
+        )
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
+        inputs = tokenizer(["This is a text input"], return_tensors="pt").to(model.device)
+
+        # If `generate` doesn't infer the DECODER device map correctly, this will fail
+        _ = model.generate(**inputs, max_new_tokens=2, do_sample=False)
+
 
 @require_torch
 class TokenHealingTestCase(unittest.TestCase):
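For context, a minimal sketch of the failure mode this test guards against: when the decoder is mapped as a single module (`"language_model": "cpu"`), no `language_model.layers.N` keys exist in the device map, so a per-layer lookup at cache-allocation time finds nothing and must fall back to the closest parent module. The `resolve_layer_device` helper below is hypothetical, not the actual transformers implementation:

```python
# Hypothetical helper, NOT the actual transformers implementation: resolve the
# device of decoder layer `layer_idx` from a device map where the decoder is
# mapped as a whole module ("language_model": "cpu") rather than per layer.
def resolve_layer_device(device_map: dict, layer_idx: int):
    key = f"language_model.layers.{layer_idx}"
    if key in device_map:  # per-layer entry: only present if the decoder was split
        return device_map[key]
    # Fall back to the longest matching parent-module prefix, e.g. "language_model"
    parents = [m for m in device_map if key == m or key.startswith(m + ".")]
    if parents:
        return device_map[max(parents, key=len)]
    raise KeyError(f"no device found for {key}")


device_map = {
    "vision_tower.vision_model.encoder.layers.0": 0,  # vision layers are split...
    "language_model": "cpu",  # ...but the decoder is not: no `.layers.` keys here
}
assert resolve_layer_device(device_map, 0) == "cpu"  # falls back to the parent module
```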
|