2024-11-25 18:23:57 +01:00
8 changed files with 65 additions and 117 deletions
--- a/api/after_request.py
+++ b/api/after_request.py
@ -2,7 +2,7 @@ from db import logs, stats, users
 from helpers import network

 async def after_request(
-    incoming_request,
+    incoming_request: dict,
    target_request: dict,
    user: dict,
    tokens: dict,
--- a/api/handler.py
+++ b/api/handler.py
@ -62,7 +62,7 @@ async def handle(incoming_request: fastapi.Request):
    user = await users.user_by_api_key(received_key.split('Bearer ')[1].strip())

    if not user or not user['status']['active']:
-        return await errors.error(401, 'Invalid or inactive NovaAI API key!', 'Try /resetkey or /credentials.')
+        return await errors.error(418, 'Invalid or inactive NovaAI API key!', 'Create a new NovaOSS API key or reactivate your account.')

    ban_reason = user['status']['ban_reason']
    if ban_reason:
@ -118,10 +118,7 @@ async def handle(incoming_request: fastapi.Request):

                for message in payload.get('messages', []):
                    if message.get('role') == 'user':
-                        try:
-                            inp += message.get('content', '') + '\n'
-                        except TypeError:
-                            inp += message['content'][0]['text'] + '\n'
+                        inp += message.get('content', '') + '\n'

            if 'functions' in payload:
                inp += '\n'.join([function.get('description', '') for function in payload.get('functions', [])])
--- a/api/main.py
+++ b/api/main.py
@ -1,10 +1,8 @@
 """FastAPI setup."""

 import os
-import ujson
 import fastapi
 import pydantic
-import responder

 from dotenv import load_dotenv

@ -79,13 +77,6 @@ async def v1_handler(request: fastapi.Request):
    res = await handler.handle(incoming_request=request)
    return res

-@app.route('/update-v1-models', methods=['GET'])
-async def update_v1_models(request: fastapi.Request):
-    res = []
-    async for response in responder.respond(path='/v1/models', overwrite_method='GET'):
-        res.append(response)
-    return res
-
@limiter.limit('100/minute', '1000/hour') 
@app.route('/enterprise/v1/{path:path}', methods=['GET', 'POST', 'PUT', 'DELETE', 'PATCH'])
 async def enterprise_handler(request: fastapi.Request):
--- a/api/providers/init.py
+++ b/api/providers/init.py
@ -1,2 +1,2 @@
-from . import closed, closed4, azure
-MODULES =    [closed, closed4, azure]
+from . import ails, closed, closed4
+MODULES =    [closed, closed4]
--- a/api/providers/main.py
+++ b/api/providers/main.py
@ -1,5 +1,3 @@
-"""CLI Tool"""
-
 import os
 import sys
 import aiohttp
@ -24,15 +22,10 @@ async def main():

        for file_name in os.listdir(os.path.dirname(__file__)):
            if file_name.endswith('.py') and not file_name.startswith('_'):
-                model_name = file_name.split('.')[0]
+                name = file_name.split('.')[0]
                models = importlib.import_module(f'.{file_name.split(".")[0]}', 'providers').MODELS
-
-                text = ''
-
-                for model in models:
-                    text += f'  - {model}\n'
-
-                print(f'  {model_name}:\n{text}')
+                
+                print(f'  {name} @ {", ".join(models)}')

        sys.exit(0)

@ -42,7 +35,7 @@ async def main():
        print(exc)
        sys.exit(1)

-    if len(sys.argv) == 3:
+    if len(sys.argv) > 2:
        model = sys.argv[2] # choose a specific model
    else:
        model = provider.MODELS[-1] # choose best model
--- a/api/providers/helpers/utils.py
+++ b/api/providers/helpers/utils.py
@ -6,26 +6,17 @@ except ModuleNotFoundError:
 # Sort the models by their value/cost/rarity.

 GPT_3 = [
-    'dall-e-2',
-    'code-davinci-002',
-    'text-davinci-002',
-    'text-davinci-003',
-
    'gpt-3.5-turbo',
    'gpt-3.5-turbo-16k',
    'gpt-3.5-turbo-0613',
    'gpt-3.5-turbo-0301',
    'gpt-3.5-turbo-16k-0613',
-
-    'gpt-3.5-turbo-instruct',
 ]

 GPT_4 = GPT_3 + [
    'gpt-4',
    'gpt-4-0314',
    'gpt-4-0613',
-    'gpt-4-1106-preview',
-    'gpt-4-vision-preview'
 ]

 GPT_4_32K = GPT_4 + [
--- a/api/responder.py
+++ b/api/responder.py
@ -22,8 +22,6 @@ from helpers.tokens import count_tokens_for_messages

 load_dotenv()

-RETRIES = 10
-
 CRITICAL_API_ERRORS = ['invalid_api_key', 'account_deactivated']
 keymanager = providerkeys.manager
 background_tasks: Set[asyncio.Task[Any]] = set()
@ -32,7 +30,7 @@ with open(os.path.join('config', 'config.yml'), encoding='utf8') as f:
    config = yaml.safe_load(f)

 def create_background_task(coro: Coroutine[Any, Any, Any]) -> None:
-    """Utilizes asyncio.create_task, which prevents the task from being garbage collected.
+    """asyncio.create_task, which prevents the task from being garbage collected.

    https://docs.python.org/3/library/asyncio-task.html#asyncio.create_task
    """
@ -44,8 +42,7 @@ async def respond(
    path: str='/v1/chat/completions',
    user: dict=None,
    payload: dict=None,
-    incoming_request=None,
-    overwrite_method=None
+    incoming_request: starlette.requests.Request=None,
 ):
    """
    Stream the completions request. Sends data in chunks
@ -78,27 +75,17 @@ async def respond(
    input_tokens = 0
    output_tokens = 0

-    if incoming_request:
-        cookies = incoming_request.cookies
-    else:
-        cookies = {}
-        
-    if overwrite_method:
-        method = overwrite_method
-    else:
-        method = incoming_request.method
-
-    for _ in range(RETRIES):
+    for _ in range(10):
        try:
            if is_chat:
                target_request = await load_balancing.balance_chat_request(payload)
            else:
                target_request = await load_balancing.balance_organic_request({
-                    'method': method,
+                    'method': incoming_request.method,
                    'path': path,
                    'payload': payload,
                    'headers': headers,
-                    'cookies': cookies
+                    'cookies': incoming_request.cookies
                })

        except ValueError:
@ -194,7 +181,7 @@ async def respond(
                        chunk_no = 0
                        buffer = ''

-                        async for chunk in response.content.iter_any():
+                        async for chunk in response.content.iter_chunked(1024):
                            chunk_no += 1

                            chunk = chunk.decode('utf8')
@ -205,21 +192,16 @@ async def respond(
                                if not chunk.strip() or chunk_no == 1:
                                    continue

-                            buffer += chunk
-                            while '\n\n' in buffer:
-                                subchunk, buffer = buffer.split('\n\n', 1)
-
-                                if not subchunk.strip():
-                                    continue
+                            subchunks = chunk.split('\n\n')
+                            buffer += subchunks[0]

+                            for subchunk in [buffer] + subchunks[1:-1]:
                                if not subchunk.startswith('data: '):
                                    subchunk = 'data: ' + subchunk

-                                subchunk = subchunk.rsplit('[DONE]', 1)[0]
-                                subchunk += '\n\n'
-
-                                yield subchunk
+                                yield subchunk + '\n\n'

+                            buffer = subchunks[-1]

                        output_tokens = chunk_no
                    break
@ -227,12 +209,10 @@ async def respond(
            except aiohttp.client_exceptions.ServerTimeoutError:
                skipped_errors['timeout'] += 1
                continue
-    else:
-        skipped_errors = {k: v for k, v in skipped_errors.items() if ((isinstance(v, int) and v > 0) or (isinstance(v, list) and len(v) > 0))}
-        skipped_errors['model'] = model
-        skipped_errors['provider'] = provider_name
-        print(f'[!] Skipped {RETRIES} errors:\n{skipped_errors}')

+    else:
+        skipped_errors = {k: v for k, v in skipped_errors.items() if ((isinstance(v, int) and v > 0) or
+                                                                      (isinstance(v, list) and len(v) > 0))}
        skipped_errors = ujson.dumps(skipped_errors, indent=4)
        yield await errors.yield_error(500,
            f'Sorry, our API seems to have issues connecting to "{model}".',
@ -241,51 +221,52 @@ async def respond(
        return

    if (not is_stream) and server_json_response:
-        server_json_response['system_fingerprint'] = f'fp_' + os.urandom(5).hex()
        yield json.dumps(server_json_response)

-    if incoming_request: # not called by other code, but actually a request
-        role = user.get('role', 'default')
-        model_multipliers = config['costs']
-        model_multiplier = model_multipliers['other']
+    role = user.get('role', 'default')

-        if is_chat:
-            model_multiplier = model_multipliers['chat-models'].get(payload.get('model'), model_multiplier)
-            total_tokens = input_tokens + output_tokens
-            credits_cost = total_tokens / 60
-            credits_cost = round(credits_cost * model_multiplier)
+    model_multipliers = config['costs']
+    model_multiplier = model_multipliers['other']

-            if credits_cost < 1:
-                credits_cost = 1
+    if is_chat:
+        model_multiplier = model_multipliers['chat-models'].get(payload.get('model'), model_multiplier)
+        total_tokens = input_tokens + output_tokens
+        credits_cost = total_tokens / 60
+        credits_cost = round(credits_cost * model_multiplier)

-            tokens = {'input': input_tokens, 'output': output_tokens, 'total': total_tokens}
+        if credits_cost < 1:
+            credits_cost = 1

-        elif model == 'dall-e-2':
-            credits_cost = 50
-            tokens = {'input': 0,'output': 0,'total': credits_cost}
+        tokens = {
+            'input': input_tokens,
+            'output': output_tokens,
+            'total': total_tokens
+        }
+    else:
+        credits_cost = 5
+        tokens = {
+            'input': 0,
+            'output': 0,
+            'total': credits_cost
+        }

-        elif model == 'dall-e-3':
-            credits_cost = 100
-            tokens = {'input': 0, 'output': 0, 'total': credits_cost}
+    try:
+        role_cost_multiplier = config['roles'][role]['bonus']
+    except KeyError:
+        role_cost_multiplier = 1

-        try:
-            role_cost_multiplier = config['roles'][role]['bonus']
-        except KeyError:
-            role_cost_multiplier = 1
+    credits_cost = round(credits_cost * role_cost_multiplier)

-
-        credits_cost = round(credits_cost * role_cost_multiplier)
-
-        create_background_task(
-            after_request.after_request(
-                provider=provider_name,
-                incoming_request=incoming_request,
-                target_request=target_request,
-                user=user,
-                credits_cost=credits_cost,
-                tokens=tokens,
-                path=path,
-                is_chat=is_chat,
-                model=model,
-            )
+    create_background_task(
+        after_request.after_request(
+            provider=provider_name,
+            incoming_request=incoming_request,
+            target_request=target_request,
+            user=user,
+            credits_cost=credits_cost,
+            tokens=tokens,
+            path=path,
+            is_chat=is_chat,
+            model=model,
        )
+    )
--- a/checks/client.py
+++ b/checks/client.py
@ -56,7 +56,7 @@ async def test_chat_non_stream_gpt4() -> float:
    """Tests non-streamed chat completions with the GPT-4 model."""

    json_data = {
-        'model': 'gpt-4-1106-preview',
+        'model': 'gpt-4',
        'messages': MESSAGES,
        'stream': False
    }
@ -74,8 +74,7 @@ async def test_chat_non_stream_gpt4() -> float:

    try:
        assert '1337' in response.json()['choices'][0]['message']['content'], 'The API did not return a correct response.'
-    except KeyError:
-        print(response.json())
+    except json.decoder.JSONDecodeError:
        return response.status_code

    return time.perf_counter() - request_start
@ -111,11 +110,7 @@ async def test_chat_stream_gpt3() -> float:
                break

            if chunk:
-                try:
-                    chunks.append(json.loads(chunk))
-                except json.decoder.JSONDecodeError:
-                    print(f'[!] Invalid chunk: {chunk}')
-                    return f'Received chunk with invalid JSON. Status code {response.status_code}.'
+                chunks.append(json.loads(chunk))

                try:
                    resulting_text += json.loads(chunk)['choices'][0]['delta']['content']