FROM nvidia/cuda:11.7.1-devel-ubuntu20.04

# Install the build toolchain, then drop the apt package lists so they do not
# end up in the image layer.
RUN apt-get update && \
    apt-get install -y build-essential ninja-build git wget && \
    rm -rf /var/lib/apt/lists/*

# Install Miniconda and create a dedicated Python 3.9 environment.
RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
    bash Miniconda3-latest-Linux-x86_64.sh -b -p /opt/conda && \
    rm Miniconda3-latest-Linux-x86_64.sh && \
    /opt/conda/bin/conda create -y --name py39 python=3.9 && \
    /opt/conda/bin/conda clean -ya

# Put the py39 environment first on PATH so `python3` / `pip` resolve to it.
ENV PATH=/opt/conda/envs/py39/bin:$PATH

RUN pip install --upgrade pip setuptools wheel

# Create user instead of using root
ENV USER=user
RUN groupadd -r $USER && useradd -r -g $USER $USER
RUN mkdir -p /home/$USER/app
RUN chown -R $USER:$USER /home/$USER
USER $USER

# Define workdir
WORKDIR /home/$USER/app

# Install project dependencies first so this layer is cached across code edits.
COPY requirements.txt requirements.txt
RUN pip install -r requirements.txt

COPY get_models.py .

# Get model weights and tokenizer
RUN python3 get_models.py

# Copy rest
COPY . .

# Document the gRPC port. EXPOSE only takes the container port; the host
# mapping is chosen at run time with `docker run -p 50051:50051`.
EXPOSE 50051

# Enjoy
ENTRYPOINT ["python3", "server.py"]
CMD ["--address", "[::]:50051"]
# Key into ``models`` selecting which checkpoint is downloaded and served.
MODEL = "raven-1b-ctx4096"

# The tokenizer file is stored next to this module.
TOKENIZER_PATH = Path(__file__).parent / "20B_tokenizer.json"

# Source of the tokenizer definition and how long to wait for it (seconds).
_TOKENIZER_URL = (
    "https://huggingface.co/spaces/BlinkDL/Raven-RWKV-7B/raw/main/20B_tokenizer.json"
)
_DOWNLOAD_TIMEOUT = 60

# Known checkpoints: Hugging Face repository id and weight-file title
# (the file name without its ``.pth`` suffix).
models = {
    "raven-14b-ctx4096": {
        "repo_id": "BlinkDL/rwkv-4-raven",
        "title": "RWKV-4-Raven-14B-v8-Eng-20230408-ctx4096",
    },
    "raven-7b-ctx4096": {
        "repo_id": "BlinkDL/rwkv-4-raven",
        "title": "RWKV-4-Raven-7B-v7-Eng-20230404-ctx4096",
    },
    "raven-7b-ctx1024": {
        "repo_id": "BlinkDL/rwkv-4-pile-7b",
        "title": "RWKV-4-Pile-7B-Instruct-test4-20230326",
    },
    "rwkv-4-pile-169m": {
        "repo_id": "BlinkDL/rwkv-4-pile-169m",
        "title": "RWKV-4-Pile-169M-20220807-8023",
    },
    "raven-1b-ctx4096": {
        "repo_id": "BlinkDL/rwkv-4-raven",
        "title": "RWKV-4-Raven-1B5-v11-Eng99%-Other1%-20230425-ctx4096",
    },
    "raven-3b-ctx4096": {
        "repo_id": "BlinkDL/rwkv-4-raven",
        "title": "RWKV-4-Raven-3B-v11-Eng99%-Other1%-20230425-ctx4096",
    },
}


def fetch_tokenizer(tokenizer_path: Path):
    """Download the 20B tokenizer definition and write it to *tokenizer_path*.

    Args:
        tokenizer_path: Destination file; missing parent directories are created.

    Raises:
        requests.HTTPError: If the server answers with an error status, so an
            HTML error page is never saved as if it were the tokenizer.
        requests.RequestException: On connection problems or timeout.
    """
    tokenizer_path.parent.mkdir(parents=True, exist_ok=True)

    response = requests.get(_TOKENIZER_URL, timeout=_DOWNLOAD_TIMEOUT)
    response.raise_for_status()
    tokenizer_path.write_bytes(response.content)


def get_model_path(model="rwkv-4-pile-169m"):
    """Return the local path of the weights for *model*, downloading if needed.

    The tokenizer file is fetched as a side effect when it is not yet present
    at ``TOKENIZER_PATH``.

    Args:
        model: A key of ``models``.

    Returns:
        The path returned by ``hf_hub_download`` for the ``.pth`` weight file.

    Raises:
        KeyError: If *model* is not a key of ``models``. This is checked before
            any network access so a typo fails immediately.
    """
    try:
        model_params = models[model]
    except KeyError:
        raise KeyError(
            f"Unknown model {model!r}; expected one of {sorted(models)}"
        ) from None

    if not TOKENIZER_PATH.exists():
        fetch_tokenizer(TOKENIZER_PATH)

    return hf_hub_download(
        repo_id=model_params["repo_id"],
        filename=f"{model_params['title']}.pth",
    )


if __name__ == "__main__":
    get_model_path(MODEL)
def generate_prompt(instruction, prompt=None):
    """Build the instruction-following prompt text.

    Args:
        instruction: The task description placed under ``# Instruction:``.
        prompt: Optional extra context placed under ``# Input:``. When falsy,
            the ``# Input:`` section is left out.

    Returns:
        The prompt string, ending with ``# Response:`` and a newline.
    """
    # Built from adjacent string literals: a ``"\`` line continuation inside a
    # triple-quoted string would put literal quote characters into the prompt.
    if prompt:
        return (
            "Below is an instruction that describes a task, paired with an input "
            "that provides further context. Write a response that appropriately "
            "completes the request.\n\n"
            f"# Instruction:\n{instruction}\n\n"
            f"# Input:\n{prompt}\n\n"
            "# Response:\n"
        )
    return (
        "Below is an instruction that describes a task. Write a response that "
        "appropriately completes the request.\n\n"
        f"# Instruction:\n{instruction}\n\n"
        "# Response:\n"
    )


def complete(
    instruction,
    model,
    pipeline: "PIPELINE",
    prompt="",
    token_count=200,
    temperature=1.0,
    top_p=0.7,
    presencePenalty=0.1,
    countPenalty=0.1,
    stop_words=None,
):
    """Stream generated text for *instruction*.

    Args:
        instruction: Text passed to the pipeline as the generation context.
        model: Not used here; the pipeline already wraps the model.
        pipeline: Object whose ``igenerate`` method produces the text pieces.
        prompt: Not used; kept so existing callers keep working.
        token_count: Forwarded to ``igenerate`` as ``token_count``.
        temperature: Sampling temperature; values below 0.2 are raised to 0.2.
        top_p: Forwarded as ``top_p``.
        presencePenalty: Forwarded as ``alpha_presence``.
        countPenalty: Forwarded as ``alpha_frequency``.
        stop_words: Forwarded as ``stop_words``.

    Yields:
        Each item produced by ``pipeline.igenerate``.
    """
    args = PIPELINE_ARGS(
        temperature=max(0.2, float(temperature)),
        top_p=float(top_p),
        alpha_frequency=countPenalty,
        alpha_presence=presencePenalty,
        token_ban=[],  # ban the generation of some tokens
        token_stop=[0],  # stop generation whenever you see any token here
        stop_words=stop_words,
    )

    yield from pipeline.igenerate(ctx=instruction, token_count=token_count, args=args)


def embedding(
    inputs: List[str],
    model,
    pipeline,
    temperature=1.0,  # TODO remove
    top_p=0.7,
    presencePenalty=0.1,
    countPenalty=0.1,
):
    """Compute one embedding per input string.

    Each input is tokenised with ``pipeline.encode``, truncated to its last
    ``ctx_limit`` tokens and run through ``model.forward``. The last element
    of the returned state is used as that input's embedding.

    Args:
        inputs: Strings to embed; must not be empty.
        model: Object whose ``forward(tokens, None)`` returns ``(_, state)``.
        pipeline: Object providing ``encode``.
        temperature, top_p, presencePenalty, countPenalty: Not used; kept so
            existing callers keep working.

    Returns:
        A tensor with the per-input embeddings concatenated along dimension 0.
        For a single input this is the same value the function returned before.

    Raises:
        ValueError: If *inputs* is empty.
    """
    # torch is not imported at module level in this file, so import it locally.
    import torch

    if not inputs:
        raise ValueError("inputs must contain at least one string")

    vectors = []
    for text in inputs:
        tokens = pipeline.encode(text)[-ctx_limit:]
        _, state = model.forward(tokens, None)
        *_, vector = state

        # Promote a 1-D vector to a single row so rows can be concatenated.
        if len(vector.shape) == 1:
            vector = vector.unsqueeze(0)
        vectors.append(vector)

    return torch.cat(vectors, dim=0)
def format_chat_log(chat: list[dict[str, str]] = None) -> str:
    """Render a chat history as a Bob/Alice transcript.

    ``user`` messages are attributed to "Bob" and ``assistant`` messages to
    "Alice"; messages with any other (or no) role are skipped. Content is
    stripped and runs of newlines are collapsed to one, so that the blank
    line between turns stays the only turn separator.

    Args:
        chat: Messages as dicts with ``role`` and ``content`` keys. ``None``
            or an empty list yields just the trailing cue.

    Returns:
        The transcript followed by ``"Alice:"``.
    """
    turns = []
    for item in chat or ():
        role = item.get("role")
        if role not in ("user", "assistant"):
            continue
        speaker = "Bob" if role == "user" else "Alice"
        # Treat missing/None content as empty instead of failing on .strip().
        content = re.sub(r"\n+", "\n", (item.get("content") or "").strip())
        turns.append(f"{speaker}: {content}\n\n")
    return "".join(turns) + "Alice:"
def complete(
    self,
    *args,
    **kwargs,
) -> str:
    """Return the whole completion as one string.

    All arguments are forwarded unchanged to ``stream_complete``; the pieces
    it yields are joined together.
    """
    return "".join(self.stream_complete(*args, **kwargs))


def stream_complete(
    self,
    prompt: str = None,
    max_tokens: int = 512,
    temperature: float = 0.9,
    top_p: float = 0.5,
    presencePenalty: float = 0.4,
    countPenalty: float = 0.4,
    stop=None,
    **kwargs,
):
    """Yield the completion for *prompt* piece by piece.

    Args:
        prompt: Text handed to ``lib_raven.complete`` as the instruction.
        max_tokens: Forwarded as ``token_count``.
        temperature, top_p, presencePenalty, countPenalty: Sampling settings,
            forwarded unchanged.
        stop: Stop words, either as a JSON-encoded string or as an already
            decoded value. ``None`` or an empty string means no stop words.
        **kwargs: Accepted and ignored.

    Yields:
        Each item produced by ``lib_raven.complete``.
    """
    # json.loads rejects None (and non-strings), which is this parameter's own
    # default, so only decode when a non-empty string was actually supplied.
    if isinstance(stop, str):
        stop_words = json.loads(stop) if stop else None
    else:
        stop_words = stop

    yield from lib_raven.complete(
        prompt,
        self.model,
        self.pipeline,
        token_count=max_tokens,
        temperature=temperature,
        top_p=top_p,
        presencePenalty=presencePenalty,
        countPenalty=countPenalty,
        stop_words=stop_words,
    )


def stream(
    self,
    chatlog: list[list[str]] = None,
    max_tokens: int = 512,
    temperature: float = 0.9,
    top_p: float = 0.5,
    presencePenalty: float = 0.4,
    countPenalty: float = 0.4,
    *args,
    **kwargs,
):
    """Yield a chat reply as a sequence of delta messages.

    The first item announces the role (``[{"role": "raven"}]``); every
    following item carries one piece of text (``[{"content": ...}]``).
    Generation uses the Bob/Alice turn markers as stop words.

    Args:
        chatlog: Chat history passed to ``format_chat_log``.
        max_tokens: Forwarded as ``token_count``.
        temperature, top_p, presencePenalty, countPenalty: Sampling settings,
            forwarded unchanged.
        *args, **kwargs: Accepted and ignored.
    """
    yield [{"role": "raven"}]

    stop_words = ["\n\nBob:", "\n\nAlice:"]
    prompt = format_chat_log(chatlog)

    started = False
    for delta in lib_raven.complete(
        prompt,
        self.model,
        self.pipeline,
        prompt=None,
        token_count=max_tokens,
        temperature=temperature,
        top_p=top_p,
        presencePenalty=presencePenalty,
        countPenalty=countPenalty,
        stop_words=stop_words,
    ):
        if not started:
            # Remove leading whitespace of the completion only. Slicing off
            # the first character would also eat a real character whenever
            # the reply does not start with a space.
            delta = delta.lstrip()
            started = bool(delta)
        yield [{"content": delta}]
def serve(
    address="[::]:50051",
    chat_servicer=None,
    embedding_servicer=None,
    completion_servicer=None,
    max_workers=10,
):
    """Start the gRPC server and block until it terminates.

    Args:
        address: Address the insecure port is bound to.
        chat_servicer: Servicer for the chat API, or ``None`` to leave it out.
        embedding_servicer: Servicer for the embedding API, or ``None``.
        completion_servicer: Servicer for the completion API, or ``None``.
        max_workers: Size of the thread pool handling requests.
    """
    server = grpc.server(futures.ThreadPoolExecutor(max_workers=max_workers))

    # Every servicer defaults to None, so only register the ones supplied.
    # NOTE(review): registering None is expected to fail inside the generated
    # gRPC code - confirm against the simple_ai stubs.
    if chat_servicer is not None:
        llm_chat_pb2_grpc.add_LanguageModelServicer_to_server(chat_servicer, server)
    if embedding_servicer is not None:
        llm_embed_pb2_grpc.add_LanguageModelServicer_to_server(embedding_servicer, server)
    if completion_servicer is not None:
        llm_pb2_grpc.add_LanguageModelServicer_to_server(completion_servicer, server)

    server.add_insecure_port(address=address)
    server.start()
    server.wait_for_termination()


if __name__ == "__main__":
    import argparse

    logging.basicConfig(level=logging.INFO)

    parser = argparse.ArgumentParser()
    parser.add_argument("-a", "--address", type=str, default="[::]:50051")
    args = parser.parse_args()

    logging.info(f"Starting gRPC server on {args.address}")

    # Build the model wrapper once and share it between the three servicers
    # instead of constructing a separate instance for each.
    model = Model()
    serve(
        address=args.address,
        chat_servicer=ChatServicer(model=model),
        embedding_servicer=EmbeddingServicer(model=model),
        completion_servicer=CompletionServicer(model=model),
    )