Path: blob/main/examples/idefics/inference_4bit.py
# this is a demo of inference of IDEFICS-9B using 4-bit quantization, which needs about 7GB of GPU memory
# and which makes it possible to run even on Google Colab

import torch
from transformers import IdeficsForVisionText2Text, AutoProcessor, BitsAndBytesConfig

device = "cuda" if torch.cuda.is_available() else "cpu"

checkpoint = "HuggingFaceM4/idefics-9b"
# checkpoint = "HuggingFaceM4/tiny-random-idefics"

# load the weights in 4-bit with float16 compute dtype to cut memory use roughly 4x
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype="float16",
)
model = IdeficsForVisionText2Text.from_pretrained(
    checkpoint, quantization_config=quantization_config, device_map="auto"
)
processor = AutoProcessor.from_pretrained(checkpoint)

# the IDEFICS processor interleaves text and images; image URLs are fetched automatically
prompts = [
    "Instruction: provide an answer to the question. Use the image to answer.\n",
    "https://hips.hearstapps.com/hmg-prod/images/cute-photos-of-cats-in-grass-1593184777.jpg",
    "Question: What's on the picture? Answer: \n",
]

# move the inputs onto the model's device before generation
inputs = processor(prompts, return_tensors="pt").to(device)
generated_ids = model.generate(**inputs, max_length=150)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
print(generated_text[0])
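
# Optional sanity check, not part of the original demo: report the quantized
# model's actual memory footprint to compare against the ~7GB figure quoted
# above. get_memory_footprint() is a standard transformers PreTrainedModel
# method that sums parameter and buffer sizes; for a 9B model in 4-bit the
# weights alone should come out around 5GB, with the remainder of the ~7GB
# budget going to activations and the KV cache at generation time.
print(f"model memory footprint: {model.get_memory_footprint() / 1024**3:.2f} GB")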