CoCalc -- inference.py

GitHub Repository: huggingface/notebooks
Path: blob/main/examples/idefics/inference.py
5906 views
# this is a demo of inference of IDEFICS-9B which needs about 20GB of GPU memory
#
# The script loads the checkpoint and its processor, builds four few-shot
# prompts that interleave text with images, runs batched generation and
# prints the decoded text of every sample.

import torch
from transformers import IdeficsForVisionText2Text, AutoProcessor

# Image locations used by the prompts below. They are named once here because
# several prompts reuse the same picture.
CATS_URL = "https://hips.hearstapps.com/hmg-prod/images/cute-photos-of-cats-in-grass-1593184777.jpg"
DOG_URL = "https://hips.hearstapps.com/hmg-prod/images/dog-puns-1581708208.jpg"
NLVR2_IMAGE1_URL = "https://huggingface.co/datasets/hf-internal-testing/fixtures_nlvr2/raw/main/image1.jpeg"
NLVR2_IMAGE2_URL = "https://huggingface.co/datasets/hf-internal-testing/fixtures_nlvr2/raw/main/image2.jpeg"

# Fall back to the CPU when no CUDA device is available.
device = "cuda" if torch.cuda.is_available() else "cpu"

checkpoint = "HuggingFaceM4/idefics-9b"
# Alternative checkpoint, kept as a toggle for quick smoke runs:
#checkpoint = "HuggingFaceM4/tiny-random-idefics"

model = IdeficsForVisionText2Text.from_pretrained(checkpoint, torch_dtype=torch.bfloat16).to(device)
processor = AutoProcessor.from_pretrained(checkpoint)

# Fetch one image up front so the prompts demonstrate both accepted forms:
# an already-fetched image object and a plain URL string handed to the processor.
url = CATS_URL
image = processor.image_processor.fetch_images(url)

# Each prompt is a list interleaving text and images: one solved example
# (question + answer) followed by the question the model should answer.
prompts = [
    [
        "User:",
        image,
        "Describe this image.\nAssistant: An image of two kittens in grass.\n",
        "User:",
        DOG_URL,
        "Describe this image.\nAssistant:",
    ],
    [
        "User:",
        DOG_URL,
        "Describe this image.\nAssistant: An image of a dog wearing funny glasses.\n",
        "User:",
        CATS_URL,
        "Describe this image.\nAssistant:",
    ],
    [
        "User:",
        image,
        "Describe this image.\nAssistant: An image of two kittens in grass.\n",
        "User:",
        NLVR2_IMAGE1_URL,
        "Describe this image.\nAssistant:",
    ],
    [
        "User:",
        NLVR2_IMAGE2_URL,
        "Describe this image.\nAssistant: An image of a dog.\n",
        "User:",
        NLVR2_IMAGE1_URL,
        "Describe this image.\nAssistant:",
    ],
]

# batched mode
inputs = processor(prompts, return_tensors="pt").to(device)
# single sample mode
#inputs = processor(prompts[0], return_tensors="pt").to(device)

# The model only produces text, so forbid the image placeholder tokens during
# generation (recommended by the IDEFICS documentation); otherwise it may try
# to emit them although no image can follow.
bad_words_ids = processor.tokenizer(
    ["<image>", "<fake_token_around_image>"], add_special_tokens=False
).input_ids

# NOTE: max_length bounds the whole sequence (prompt tokens + generated tokens).
generated_ids = model.generate(**inputs, bad_words_ids=bad_words_ids, max_length=128)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
for i, t in enumerate(generated_text):
    print(f"{i}:\n{t}\n")
Product

Resources

Company