From f6b0698683fbb757d3289cf7bf0ac239e7c44d61 Mon Sep 17 00:00:00 2001 From: Nintorac Dev Date: Thu, 20 Apr 2023 19:01:53 +1000 Subject: [PATCH 1/2] feature: added RWKV example --- examples/Raven-RWKV/Dockerfile | 44 ++++++ examples/Raven-RWKV/README.md | 48 +++++++ examples/Raven-RWKV/get_models.py | 52 +++++++ examples/Raven-RWKV/lib_raven.py | 202 +++++++++++++++++++++++++++ examples/Raven-RWKV/model.py | 141 +++++++++++++++++++ examples/Raven-RWKV/requirements.txt | 11 ++ examples/Raven-RWKV/server.py | 55 ++++++++ 7 files changed, 553 insertions(+) create mode 100644 examples/Raven-RWKV/Dockerfile create mode 100644 examples/Raven-RWKV/README.md create mode 100644 examples/Raven-RWKV/get_models.py create mode 100644 examples/Raven-RWKV/lib_raven.py create mode 100644 examples/Raven-RWKV/model.py create mode 100644 examples/Raven-RWKV/requirements.txt create mode 100644 examples/Raven-RWKV/server.py diff --git a/examples/Raven-RWKV/Dockerfile b/examples/Raven-RWKV/Dockerfile new file mode 100644 index 0000000..f0232a1 --- /dev/null +++ b/examples/Raven-RWKV/Dockerfile @@ -0,0 +1,44 @@ +FROM nvidia/cuda:11.7.1-devel-ubuntu20.04 + +# Update, install +RUN apt-get update && \ + apt-get install -y build-essential ninja-build git wget + +RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ + bash Miniconda3-latest-Linux-x86_64.sh -b -p /opt/conda && \ + rm Miniconda3-latest-Linux-x86_64.sh && \ + /opt/conda/bin/conda create -y --name py39 python=3.9 && \ + /opt/conda/bin/conda clean -ya + +ENV PATH /opt/conda/envs/py39/bin:$PATH + +RUN pip install --upgrade pip setuptools wheel + +# Create user instead of using root +ENV USER='user' +RUN groupadd -r user && useradd -r -g $USER $USER +RUN mkdir -p /home/$USER/app +RUN chown -R $USER:$USER /home/$USER +USER $USER + +# Define workdir +WORKDIR /home/$USER/app + +# Install project +COPY requirements.txt requirements.txt +RUN pip install -r requirements.txt + +COPY get_models.py . 
+ +# Get model weights and tokenizer +RUN python3 get_models.py + +# Copy rest +COPY . . + +# Publish port +EXPOSE 50051:50051 + +# Enjoy +ENTRYPOINT ["python3", "server.py"] +CMD ["--address", "[::]:50051"] \ No newline at end of file diff --git a/examples/Raven-RWKV/README.md b/examples/Raven-RWKV/README.md new file mode 100644 index 0000000..cd72fbb --- /dev/null +++ b/examples/Raven-RWKV/README.md @@ -0,0 +1,48 @@ +# RavenRWKV service + +## Description + +This project uses the [RWKV-LM](https://github.com/BlinkDL/RWKV-LM) model and turns it into a gRPC service that can be used through [SimpleAI](https://github.com/lhenault/simpleAI). + +RWKV is an RNN with Transformer-level language model performance that can be trained like a GPT transformer and is 100% attention-free. It combines the best of RNN and transformer, providing great performance, fast inference, low VRAM usage, fast training, "infinite" ctx_len, and free sentence embedding. + +## Usage + +Edit the `MODEL` variable in `get_models.py` to choose the model size and context. + +Edit the `STRATEGY` variable in `lib_raven.py` to decide how the weights will be loaded; experiment with it to optimise the throughput for your system. See below for a graphic explanation or check out [ChatRWKV](https://github.com/BlinkDL/ChatRWKV) for more information. + +![Strategies as of 20 Apr 2023](https://raw.githubusercontent.com/BlinkDL/ChatRWKV/536b4b3bf87fbd999798141f409b151ca91a76c7/ChatRWKV-strategy.png) + +## Build + +```bash +docker build . 
-t raven-rwkv-service:latest +``` + +## Start service + +```bash +docker run -it --rm -p 50051:50051 --gpus all raven-rwkv-service:latest +``` + +## Add to model.toml + +``` +```toml +[raven] + [raven.metadata] + owned_by = 'BlinkDL' + permission = [] + description = 'RWKV fine tuned for instruction answering' + [raven.network] + url = 'localhost:50051' +``` + +``` + +## Credits + +Heavily borrowed from lhenault & BlinkDL + +https://huggingface.co/spaces/BlinkDL/Raven-RWKV-7B \ No newline at end of file diff --git a/examples/Raven-RWKV/get_models.py b/examples/Raven-RWKV/get_models.py new file mode 100644 index 0000000..45c80fe --- /dev/null +++ b/examples/Raven-RWKV/get_models.py @@ -0,0 +1,52 @@ +from pathlib import Path + +import requests +from huggingface_hub import hf_hub_download + +MODEL = "rwkv-4-pile-169m" + +TOKENIZER_PATH = Path(__file__).parent / "20B_tokenizer.json" +models = { + "raven-14b-ctx4096": { + "repo_id": "BlinkDL/rwkv-4-raven", + "title": "RWKV-4-Raven-14B-v8-Eng-20230408-ctx4096", + }, + "raven-7b-ctx4096": { + "repo_id": "BlinkDL/rwkv-4-raven", + "title": "RWKV-4-Raven-7B-v7-Eng-20230404-ctx4096", + }, + "raven-7b-ctx1024": { + "repo_id": "BlinkDL/rwkv-4-pile-7b", + "title": "RWKV-4-Pile-7B-Instruct-test4-20230326", + }, + "rwkv-4-pile-169m": { + "repo_id": "BlinkDL/rwkv-4-pile-169m", + "title": "RWKV-4-Pile-169M-20220807-8023", + }, +} + + +def fetch_tokenizer(tokenizer_path: Path): + url = "https://huggingface.co/spaces/BlinkDL/Raven-RWKV-7B/raw/main/20B_tokenizer.json" + tokenizer_path.parent.mkdir(exist_ok=True) + + response = requests.get(url) + tokenizer_path.write_bytes(response.content) + + +def get_model_path(model="rwkv-4-pile-169m"): + tokenizer_path = Path(__file__).parent / "20B_tokenizer.json" + if not tokenizer_path.exists(): + fetch_tokenizer(tokenizer_path) + + model_params = models[model] + + model_path = hf_hub_download( + repo_id=model_params["repo_id"], filename=f"{model_params['title']}.pth" + ) + + return model_path 
+ + +if __name__ == "__main__": + get_model_path(MODEL) diff --git a/examples/Raven-RWKV/lib_raven.py b/examples/Raven-RWKV/lib_raven.py new file mode 100644 index 0000000..178b5cb --- /dev/null +++ b/examples/Raven-RWKV/lib_raven.py @@ -0,0 +1,202 @@ +import gc +import logging +import os +from typing import List + +import torch +from get_models import MODEL, TOKENIZER_PATH, get_model_path +from pynvml import nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo, nvmlInit + +# if RWKV_CUDA_ON='1' then use CUDA kernel for seq mode (much faster) +# these settings must be configured before attempting to import rwkv +os.environ["RWKV_JIT_ON"] = "1" +os.environ["RWKV_CUDA_ON"] = "1" +from rwkv.model import RWKV # noqa: E402 +from rwkv.utils import PIPELINE, PIPELINE_ARGS # noqa: E402 + +STRATEGIES = { + "streaming": "cuda fp16i8 *40+ -> cpu fp32 *1", # Quite slow, take ~3gb VRAM + "fp16i8": "cuda fp16i8 *40 -> cpu fp32 *1", # fits the 14b on a T4, quite fast + "cpu": "cpu fp32 *1", # requires a lot of RAM +} + +STRATEGY = STRATEGIES["streaming"] + +logger = logging.getLogger(__file__) + +nvmlInit() +gpu_h = nvmlDeviceGetHandleByIndex(0) +ctx_limit = 4096 + + +def get_model(): + model_path = get_model_path(MODEL) + + model = RWKV( + model=model_path, strategy="cuda fp16i8 *40 -> cuda fp16i8 *0+ -> cpu fp32 *1" + ) # stream mode w/some static + + pipeline = PIPELINE(model, str(TOKENIZER_PATH)) + + return model, pipeline + + +def generate_prompt(instruction, prompt=None): + if prompt: + return f"""Below is an instruction that describes a task, paired with an input"\ + " that provides further context. Write a response that appropriately completes the request. + +# Instruction: +{instruction} + +# Input: +{prompt} + +# Response: +""" + else: + return f"""Below is an instruction that describes a task. Write a response that "\ + "appropriately completes the request. 
+ +# Instruction: +{instruction} + +# Response: +""" + + +def chat( + instruction, + model, + pipeline, + prompt="", + token_count=200, + temperature=1.0, + top_p=0.7, + presencePenalty=0.1, + countPenalty=0.1, +): + args = PIPELINE_ARGS( + temperature=max(0.2, float(temperature)), + top_p=float(top_p), + alpha_frequency=countPenalty, + alpha_presence=presencePenalty, + token_ban=[], # ban the generation of some tokens + token_stop=[0], + ) # stop generation whenever you see any token here + + ctx = instruction + + gpu_info = nvmlDeviceGetMemoryInfo(gpu_h) + logger.debug(f"vram {gpu_info.total} used {gpu_info.used} free {gpu_info.free}") + + all_tokens = [] + out_last = 0 + out_str = "" + occurrence = {} + state = None + token = None + for i in range(int(token_count)): + out, state = model.forward(pipeline.encode(ctx)[-ctx_limit:] if i == 0 else [token], state) + for n in occurrence: + out[n] -= args.alpha_presence + occurrence[n] * args.alpha_frequency + + token = pipeline.sample_logits(out, temperature=args.temperature, top_p=args.top_p) + if token in args.token_stop: + break + all_tokens += [token] + if token not in occurrence: + occurrence[token] = 1 + else: + occurrence[token] += 1 + + tmp = pipeline.decode(all_tokens[out_last:]) + if "\ufffd" not in tmp: + out_str += tmp + yield tmp + out_last = i + 1 + gc.collect() + torch.cuda.empty_cache() + + +def embedding( + inputs: List[str], + model, + pipeline, + temperature=1.0, # TODO remove + top_p=0.7, + presencePenalty=0.1, + countPenalty=0.1, +): + PIPELINE_ARGS( + temperature=max(0.2, float(temperature)), + top_p=float(top_p), + alpha_frequency=countPenalty, + alpha_presence=presencePenalty, + token_ban=[], # ban the generation of some tokens + token_stop=[0], + ) # stop generation whenever you see any token here + + gpu_info = nvmlDeviceGetMemoryInfo(gpu_h) + logger.debug(f"vram {gpu_info.total} used {gpu_info.used} free {gpu_info.free}") + + context = [pipeline.encode(ctx)[-ctx_limit:] for ctx in inputs] + 
_, state = model.forward(context[0], None) + *_, embedding = state + + if len(embedding.shape) == 1: + embedding = embedding.unsqueeze(0) + return embedding + + +def complete( + prompt, + model, + pipeline, + token_count=200, + temperature=1.0, + top_p=0.7, + presencePenalty=0.1, + countPenalty=0.1, +): + args = PIPELINE_ARGS( + temperature=max(0.2, float(temperature)), + top_p=float(top_p), + alpha_frequency=countPenalty, + alpha_presence=presencePenalty, + token_ban=[], # ban the generation of some tokens + token_stop=[0], + ) # stop generation whenever you see any token here + + ctx = prompt + + gpu_info = nvmlDeviceGetMemoryInfo(gpu_h) + logger.debug(f"vram {gpu_info.total} used {gpu_info.used} free {gpu_info.free}") + + all_tokens = [] + out_last = 0 + out_str = "" + occurrence = {} + state = None + token = None + for i in range(int(token_count)): + out, state = model.forward(pipeline.encode(ctx)[-ctx_limit:] if i == 0 else [token], state) + for n in occurrence: + out[n] -= args.alpha_presence + occurrence[n] * args.alpha_frequency + + token = pipeline.sample_logits(out, temperature=args.temperature, top_p=args.top_p) + if token in args.token_stop: + break + all_tokens += [token] + if token not in occurrence: + occurrence[token] = 1 + else: + occurrence[token] += 1 + + tmp = pipeline.decode(all_tokens[out_last:]) + if "\ufffd" not in tmp: + out_str += tmp + yield tmp + out_last = i + 1 + gc.collect() + torch.cuda.empty_cache() diff --git a/examples/Raven-RWKV/model.py b/examples/Raven-RWKV/model.py new file mode 100644 index 0000000..04ae60b --- /dev/null +++ b/examples/Raven-RWKV/model.py @@ -0,0 +1,141 @@ +import logging +from dataclasses import dataclass + +import lib_raven +import torch +from simple_ai.api.grpc.chat.server import LanguageModel +from simple_ai.utils import format_chat_log + + +def endOverlap(a, b): + for i in range(1, len(a) + 1): + if b.startswith(a[-i:]): + return i + return 0 + + +@dataclass(unsafe_hash=True) +class 
RavenRWKVModel(LanguageModel): + gpu_id: int = 0 + device = torch.device("cuda", gpu_id) if torch.cuda.is_available() else torch.device("cpu") + model, pipeline = lib_raven.get_model() + + def chat( + self, + chatlog: list[list[str]] = None, + max_tokens: int = 512, + temperature: float = 0.9, + top_p: int = 0.5, + presencePenalty: int = 0.4, + countPenalty: int = 0.4, + *args, + **kwargs, + ) -> str: + prompt = format_chat_log(chatlog) + output = lib_raven.chat( + prompt, + self.model, + self.pipeline, + prompt=None, + token_count=max_tokens, + temperature=temperature, + top_p=top_p, + presencePenalty=presencePenalty, + countPenalty=countPenalty, + ) + + output = "".join(output) + + return [{"role": "raven", "content": output}] + + def complete( + self, + *args, + **kwargs, + ) -> str: + output = self.stream_complete(*args, **kwargs) + output = "".join(output) + + return output + + def stream_complete( + self, + prompt: str = None, + max_tokens: int = 512, + temperature: float = 0.9, + top_p: int = 0.5, + presencePenalty: int = 0.4, + countPenalty: int = 0.4, + *args, + **kwargs, + ) -> str: + output = lib_raven.complete( + prompt, + self.model, + self.pipeline, + token_count=max_tokens, + temperature=temperature, + top_p=top_p, + presencePenalty=presencePenalty, + countPenalty=countPenalty, + ) + yield from output + + def stream( + self, + chatlog: list[list[str]] = None, + max_tokens: int = 512, + temperature: float = 0.9, + top_p: int = 0.5, + presencePenalty: int = 0.4, + countPenalty: int = 0.4, + *args, + **kwargs, + ): + yield [{"role": "raven"}] + + stop_words = set([f"{message['role']}:" for message in chatlog]) + + prompt = format_chat_log(chatlog) + chunk = "" + for delta in lib_raven.chat( + prompt, + self.model, + self.pipeline, + prompt=None, + token_count=max_tokens, + temperature=temperature, + top_p=top_p, + presencePenalty=presencePenalty, + countPenalty=countPenalty, + ): + chunk = chunk + delta + longest_stopword = max(map(len, stop_words)) + + 
if start_idx := max(map(lambda stop_word: endOverlap(chunk, stop_word), stop_words)): + if start_idx > longest_stopword: + start_idx = longest_stopword # can no longer be a stopword so cut it down + good, chunk = chunk[:-start_idx], chunk[-start_idx:] + + if good: + yield [{"content": good}] + + if any(map(lambda stop_word: chunk.startswith(stop_word), stop_words)): + return + continue + + # if start_idx:=max(map(lambda stop_word: endOverlap(stop_word, chunk), stop_words))>0: + + yield [{"content": chunk}] + chunk = "" + + def embed( + self, + inputs: list = [], + ) -> list: + logging.info(f"Processing inputs : {inputs}") + embeddings = lib_raven.embedding(inputs, self.model, self.pipeline) + logging.info( + f"Successfully computed embeddings (shape : {embeddings.shape}) for inputs : {inputs}" + ) + return embeddings.tolist() diff --git a/examples/Raven-RWKV/requirements.txt b/examples/Raven-RWKV/requirements.txt new file mode 100644 index 0000000..ab1377a --- /dev/null +++ b/examples/Raven-RWKV/requirements.txt @@ -0,0 +1,11 @@ +inquirer==3.1.3 +loralib==0.1.1 +ninja==1.11.1 +pynvml==11.5.0 +rwkv==0.6.2 +scipy==1.10.1 +sentencepiece==0.1.97 +simple-ai-server==0.2.0 +tokenizers==0.13.2 +torch==2.0.0 +transformers==4.27.3 diff --git a/examples/Raven-RWKV/server.py b/examples/Raven-RWKV/server.py new file mode 100644 index 0000000..d663326 --- /dev/null +++ b/examples/Raven-RWKV/server.py @@ -0,0 +1,55 @@ +import logging +from concurrent import futures + +import grpc +from model import RavenRWKVModel as Model +from simple_ai.api.grpc.chat.server import ( + LanguageModelServicer as ChatServicer, + llm_chat_pb2_grpc, +) +from simple_ai.api.grpc.completion.server import ( + LanguageModelServicer as CompletionServicer, + llm_pb2_grpc, +) +from simple_ai.api.grpc.embedding.server import ( + LanguageModelServicer as EmbeddingServicer, + llm_embed_pb2_grpc, +) + + +def serve( + address="[::]:50051", + chat_servicer=None, + embedding_servicer=None, + 
completion_servicer=None, + max_workers=10, +): + server = grpc.server(futures.ThreadPoolExecutor(max_workers=max_workers)) + llm_chat_pb2_grpc.add_LanguageModelServicer_to_server(chat_servicer, server) + llm_embed_pb2_grpc.add_LanguageModelServicer_to_server(embedding_servicer, server) + llm_pb2_grpc.add_LanguageModelServicer_to_server(completion_servicer, server) + server.add_insecure_port(address=address) + server.start() + server.wait_for_termination() + + +if __name__ == "__main__": + import argparse + + logging.basicConfig(level=logging.INFO) + + parser = argparse.ArgumentParser() + parser.add_argument("-a", "--address", type=str, default="[::]:50051") + args = parser.parse_args() + + logging.info(f"Starting gRPC server on {args.address}") + model = Model() + chat_servicer = ChatServicer(model=Model()) + embedding_servicer = EmbeddingServicer(model=Model()) + completion_servicer = CompletionServicer(model=Model()) + serve( + address=args.address, + chat_servicer=chat_servicer, + embedding_servicer=embedding_servicer, + completion_servicer=completion_servicer, + ) From 47051045bfb82c40840d309bd26d5bba6b8ee7bf Mon Sep 17 00:00:00 2001 From: Nintorac Dev Date: Sun, 14 May 2023 18:23:11 +1000 Subject: [PATCH 2/2] cleanup + fix non-GPU instances not working --- examples/Raven-RWKV/get_models.py | 10 ++- examples/Raven-RWKV/lib_raven.py | 114 +++------------------------ examples/Raven-RWKV/logging.conf | 36 +++++++++ examples/Raven-RWKV/model.py | 57 +++++++------- examples/Raven-RWKV/requirements.txt | 2 +- 5 files changed, 83 insertions(+), 136 deletions(-) create mode 100644 examples/Raven-RWKV/logging.conf diff --git a/examples/Raven-RWKV/get_models.py b/examples/Raven-RWKV/get_models.py index 45c80fe..3908388 100644 --- a/examples/Raven-RWKV/get_models.py +++ b/examples/Raven-RWKV/get_models.py @@ -3,7 +3,7 @@ import requests from huggingface_hub import hf_hub_download -MODEL = "rwkv-4-pile-169m" +MODEL = "raven-1b-ctx4096" TOKENIZER_PATH = 
Path(__file__).parent / "20B_tokenizer.json" models = { @@ -23,6 +23,14 @@ "repo_id": "BlinkDL/rwkv-4-pile-169m", "title": "RWKV-4-Pile-169M-20220807-8023", }, + "raven-1b-ctx4096": { + "repo_id": "BlinkDL/rwkv-4-raven", + "title": "RWKV-4-Raven-1B5-v11-Eng99%-Other1%-20230425-ctx4096", + }, + "raven-3b-ctx4096": { + "repo_id": "BlinkDL/rwkv-4-raven", + "title": "RWKV-4-Raven-3B-v11-Eng99%-Other1%-20230425-ctx4096", + }, } diff --git a/examples/Raven-RWKV/lib_raven.py b/examples/Raven-RWKV/lib_raven.py index 178b5cb..85d7417 100644 --- a/examples/Raven-RWKV/lib_raven.py +++ b/examples/Raven-RWKV/lib_raven.py @@ -1,18 +1,12 @@ -import gc import logging -import os from typing import List -import torch from get_models import MODEL, TOKENIZER_PATH, get_model_path -from pynvml import nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo, nvmlInit # if RWKV_CUDA_ON='1' then use CUDA kernel for seq mode (much faster) # these settings must be configured before attempting to import rwkv -os.environ["RWKV_JIT_ON"] = "1" -os.environ["RWKV_CUDA_ON"] = "1" -from rwkv.model import RWKV # noqa: E402 -from rwkv.utils import PIPELINE, PIPELINE_ARGS # noqa: E402 +from rwkv.model import RWKV +from rwkv.utils import PIPELINE, PIPELINE_ARGS STRATEGIES = { "streaming": "cuda fp16i8 *40+ -> cpu fp32 *1", # Quite slow, take ~3gb VRAM @@ -20,21 +14,17 @@ "cpu": "cpu fp32 *1", # requires a lot of RAM } -STRATEGY = STRATEGIES["streaming"] +STRATEGY = STRATEGIES["cpu"] logger = logging.getLogger(__file__) -nvmlInit() -gpu_h = nvmlDeviceGetHandleByIndex(0) ctx_limit = 4096 def get_model(): model_path = get_model_path(MODEL) - model = RWKV( - model=model_path, strategy="cuda fp16i8 *40 -> cuda fp16i8 *0+ -> cpu fp32 *1" - ) # stream mode w/some static + model = RWKV(model=model_path, strategy=STRATEGY) # stream mode w/some static pipeline = PIPELINE(model, str(TOKENIZER_PATH)) @@ -65,16 +55,17 @@ def generate_prompt(instruction, prompt=None): """ -def chat( +def complete( instruction, model, - 
pipeline, + pipeline: PIPELINE, prompt="", token_count=200, temperature=1.0, top_p=0.7, presencePenalty=0.1, countPenalty=0.1, + stop_words=None, ): args = PIPELINE_ARGS( temperature=max(0.2, float(temperature)), @@ -83,40 +74,11 @@ def chat( alpha_presence=presencePenalty, token_ban=[], # ban the generation of some tokens token_stop=[0], + stop_words=stop_words, ) # stop generation whenever you see any token here - ctx = instruction - - gpu_info = nvmlDeviceGetMemoryInfo(gpu_h) - logger.debug(f"vram {gpu_info.total} used {gpu_info.used} free {gpu_info.free}") - - all_tokens = [] - out_last = 0 - out_str = "" - occurrence = {} - state = None - token = None - for i in range(int(token_count)): - out, state = model.forward(pipeline.encode(ctx)[-ctx_limit:] if i == 0 else [token], state) - for n in occurrence: - out[n] -= args.alpha_presence + occurrence[n] * args.alpha_frequency - - token = pipeline.sample_logits(out, temperature=args.temperature, top_p=args.top_p) - if token in args.token_stop: - break - all_tokens += [token] - if token not in occurrence: - occurrence[token] = 1 - else: - occurrence[token] += 1 - - tmp = pipeline.decode(all_tokens[out_last:]) - if "\ufffd" not in tmp: - out_str += tmp - yield tmp - out_last = i + 1 - gc.collect() - torch.cuda.empty_cache() + for delta in pipeline.igenerate(ctx=instruction, token_count=token_count, args=args): + yield delta def embedding( @@ -137,9 +99,6 @@ def embedding( token_stop=[0], ) # stop generation whenever you see any token here - gpu_info = nvmlDeviceGetMemoryInfo(gpu_h) - logger.debug(f"vram {gpu_info.total} used {gpu_info.used} free {gpu_info.free}") - context = [pipeline.encode(ctx)[-ctx_limit:] for ctx in inputs] _, state = model.forward(context[0], None) *_, embedding = state @@ -147,56 +106,3 @@ def embedding( if len(embedding.shape) == 1: embedding = embedding.unsqueeze(0) return embedding - - -def complete( - prompt, - model, - pipeline, - token_count=200, - temperature=1.0, - top_p=0.7, - 
presencePenalty=0.1, - countPenalty=0.1, -): - args = PIPELINE_ARGS( - temperature=max(0.2, float(temperature)), - top_p=float(top_p), - alpha_frequency=countPenalty, - alpha_presence=presencePenalty, - token_ban=[], # ban the generation of some tokens - token_stop=[0], - ) # stop generation whenever you see any token here - - ctx = prompt - - gpu_info = nvmlDeviceGetMemoryInfo(gpu_h) - logger.debug(f"vram {gpu_info.total} used {gpu_info.used} free {gpu_info.free}") - - all_tokens = [] - out_last = 0 - out_str = "" - occurrence = {} - state = None - token = None - for i in range(int(token_count)): - out, state = model.forward(pipeline.encode(ctx)[-ctx_limit:] if i == 0 else [token], state) - for n in occurrence: - out[n] -= args.alpha_presence + occurrence[n] * args.alpha_frequency - - token = pipeline.sample_logits(out, temperature=args.temperature, top_p=args.top_p) - if token in args.token_stop: - break - all_tokens += [token] - if token not in occurrence: - occurrence[token] = 1 - else: - occurrence[token] += 1 - - tmp = pipeline.decode(all_tokens[out_last:]) - if "\ufffd" not in tmp: - out_str += tmp - yield tmp - out_last = i + 1 - gc.collect() - torch.cuda.empty_cache() diff --git a/examples/Raven-RWKV/logging.conf b/examples/Raven-RWKV/logging.conf new file mode 100644 index 0000000..894773c --- /dev/null +++ b/examples/Raven-RWKV/logging.conf @@ -0,0 +1,36 @@ +[loggers] +keys=root,uicheckapp + +[handlers] +keys=consoleHandler,detailedConsoleHandler + +[formatters] +keys=normalFormatter,detailedFormatter + +[logger_root] +level=INFO +handlers=consoleHandler + +[logger_uicheckapp] +level=DEBUG +handlers=detailedConsoleHandler +qualname=uicheckapp +propagate=0 + +[handler_consoleHandler] +class=StreamHandler +level=DEBUG +formatter=normalFormatter +args=(sys.stdout,) + +[handler_detailedConsoleHandler] +class=StreamHandler +level=DEBUG +formatter=detailedFormatter +args=(sys.stdout,) + +[formatter_normalFormatter] +format=%(asctime)s loglevel=%(levelname)-6s 
logger=%(name)s %(funcName)s() L%(lineno)-4d %(message)s + +[formatter_detailedFormatter] +format=%(asctime)s loglevel=%(levelname)-6s logger=%(name)s %(funcName)s() L%(lineno)-4d %(message)s call_trace=%(pathname)s L%(lineno)-4d \ No newline at end of file diff --git a/examples/Raven-RWKV/model.py b/examples/Raven-RWKV/model.py index 04ae60b..6976559 100644 --- a/examples/Raven-RWKV/model.py +++ b/examples/Raven-RWKV/model.py @@ -1,17 +1,24 @@ +import json import logging +import re from dataclasses import dataclass import lib_raven import torch from simple_ai.api.grpc.chat.server import LanguageModel -from simple_ai.utils import format_chat_log -def endOverlap(a, b): - for i in range(1, len(a) + 1): - if b.startswith(a[-i:]): - return i - return 0 +def format_chat_log(chat: list[dict[str, str]] = dict()) -> str: + raw_chat_text = "" + for item in chat: + if item["role"] not in ("user", "assistant"): + continue + role = "Bob" if item.get("role") == "user" else "Alice" + content = item.get("content").strip() + content = re.sub("\n+", "\n", content) + + raw_chat_text += f"{role}: {content}\n\n" + return raw_chat_text + "Alice:" @dataclass(unsafe_hash=True) @@ -32,7 +39,7 @@ def chat( **kwargs, ) -> str: prompt = format_chat_log(chatlog) - output = lib_raven.chat( + output = lib_raven.complete( prompt, self.model, self.pipeline, @@ -66,9 +73,11 @@ def stream_complete( top_p: int = 0.5, presencePenalty: int = 0.4, countPenalty: int = 0.4, - *args, + stop=None, + # *args, **kwargs, ) -> str: + stop = json.loads(stop) output = lib_raven.complete( prompt, self.model, @@ -78,6 +87,7 @@ def stream_complete( top_p=top_p, presencePenalty=presencePenalty, countPenalty=countPenalty, + stop_words=stop, ) yield from output @@ -94,11 +104,11 @@ def stream( ): yield [{"role": "raven"}] - stop_words = set([f"{message['role']}:" for message in chatlog]) + stop_words = ["\n\nBob:", "\n\nAlice:"] prompt = format_chat_log(chatlog) - chunk = "" - for delta in lib_raven.chat( + first = 
True + for delta in lib_raven.complete( prompt, self.model, self.pipeline, @@ -108,26 +118,13 @@ def stream( top_p=top_p, presencePenalty=presencePenalty, countPenalty=countPenalty, + stop_words=stop_words, ): - chunk = chunk + delta - longest_stopword = max(map(len, stop_words)) - - if start_idx := max(map(lambda stop_word: endOverlap(chunk, stop_word), stop_words)): - if start_idx > longest_stopword: - start_idx = longest_stopword # can no longer be a stopword so cut it down - good, chunk = chunk[:-start_idx], chunk[-start_idx:] - - if good: - yield [{"content": good}] - - if any(map(lambda stop_word: chunk.startswith(stop_word), stop_words)): - return - continue - - # if start_idx:=max(map(lambda stop_word: endOverlap(stop_word, chunk), stop_words))>0: - - yield [{"content": chunk}] - chunk = "" + clean_delta = delta + if first: + clean_delta = delta[1:] ## remove leading whitespace in completion + first = False + yield [{"content": clean_delta}] def embed( self, diff --git a/examples/Raven-RWKV/requirements.txt b/examples/Raven-RWKV/requirements.txt index ab1377a..2d40d61 100644 --- a/examples/Raven-RWKV/requirements.txt +++ b/examples/Raven-RWKV/requirements.txt @@ -2,10 +2,10 @@ inquirer==3.1.3 loralib==0.1.1 ninja==1.11.1 pynvml==11.5.0 -rwkv==0.6.2 scipy==1.10.1 sentencepiece==0.1.97 simple-ai-server==0.2.0 tokenizers==0.13.2 torch==2.0.0 transformers==4.27.3 +git+https://github.com/Nintorac/ChatRWKV.git@parallel#subdirectory=rwkv_pip_package