And use it in the following code:
"""Example: list models and stream a completion from the dolphin-mixtral-8x7b
model served on Lepton, via the OpenAI-compatible API."""
import os
import sys

import openai

# Read the API token from the environment and fail fast with a clear
# message rather than sending an unauthenticated request to the server.
api_token = os.environ.get('LEPTON_API_TOKEN')
if not api_token:
    sys.exit("LEPTON_API_TOKEN environment variable is not set")

# Point the OpenAI SDK at the Lepton deployment's OpenAI-compatible endpoint.
openai.base_url = "https://dolphin-mixtral-8x7b.lepton.run/api/v1/"
openai.api_key = api_token

# List available models
print("==== Available models ====")
models = openai.models.list()
print(models)

model = "dolphin-mixtral-8x7b"
# stream=True yields the completion incrementally as chunks instead of one
# final response, so output can be printed as it is generated.
completion = openai.completions.create(
    model=model,
    prompt="<|im_start|>user\n# Python\ndef fibonacci(n):<|im_end|>\n<|im_start|>assistant",
    max_tokens=256,
    stream=True,
)

print(f"==== Model: {model} ====")
for chunk in completion:
    content = chunk.choices[0].text
    # Some stream chunks carry no text (e.g. the final one); skip them.
    if content:
        sys.stdout.write(content)
        sys.stdout.flush()
sys.stdout.write("\n")
The rate limit for the Model APIs is 10 requests per minute across all models under the Basic Plan. If you need a higher rate limit with an SLA, please upgrade to the Standard Plan, or use a dedicated deployment.