ipex_llm_llama2_4bit_inference_on_intel_cpu.py

This quickstart demonstrates how to load a Llama-2 model with 4-bit optimization using IPEX-LLM and run inference on an Intel CPU.

import torch
from ipex_llm.transformers import AutoModelForCausalLM
from transformers import LlamaTokenizer

# 1. Load the model with 4-bit optimization
# You can replace "meta-llama/Llama-2-7b-chat-hf" with your local model path
model_id = "meta-llama/Llama-2-7b-chat-hf"

# AutoModelForCausalLM from ipex_llm.transformers converts the weights to
# low-bit (INT4) format on the fly when load_in_4bit=True is passed
model = AutoModelForCausalLM.from_pretrained(model_id,
                                             load_in_4bit=True,
                                             trust_remote_code=True)

# 2. Load the tokenizer
tokenizer = LlamaTokenizer.from_pretrained(model_id, trust_remote_code=True)

# 3. Prepare the input
prompt = "What is AI?"
with torch.inference_mode():
    inputs = tokenizer(prompt, return_tensors="pt")

    # 4. Generate output
    output = model.generate(**inputs, max_new_tokens=32)

    # 5. Decode and print the result
    output_str = tokenizer.decode(output[0], skip_special_tokens=True)
    print('-' * 20)
    print(output_str)
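
The -chat-hf variants of Llama 2 are instruction-tuned on the [INST] prompt format, so wrapping the question in that template generally yields better answers than a bare string. A minimal sketch, reusing the model and tokenizer loaded above (the system message here is an arbitrary example, not part of the original snippet):

# Llama-2 chat models expect the [INST] ... [/INST] wrapper around the
# user turn; a plain string still works, but responses are usually better
# with the template applied.
LLAMA2_TEMPLATE = """[INST] <<SYS>>
You are a helpful assistant.
<</SYS>>

{question} [/INST]"""

with torch.inference_mode():
    inputs = tokenizer(LLAMA2_TEMPLATE.format(question="What is AI?"),
                       return_tensors="pt")
    output = model.generate(**inputs, max_new_tokens=64)
    print(tokenizer.decode(output[0], skip_special_tokens=True))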
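
Loading from the Hugging Face checkpoint re-runs the FP16-to-INT4 conversion on every start. IPEX-LLM also provides save_low_bit / load_low_bit, which persist the already-converted weights so later loads skip that step; a sketch assuming a writable ./llama2-4bit directory (an example path, not from the original snippet):

# Persist the converted 4-bit weights (plus tokenizer files) once...
save_path = "./llama2-4bit"
model.save_low_bit(save_path)
tokenizer.save_pretrained(save_path)

# ...then reload them directly in low-bit form on subsequent runs
model = AutoModelForCausalLM.load_low_bit(save_path, trust_remote_code=True)
tokenizer = LlamaTokenizer.from_pretrained(save_path)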