From f6b0698683fbb757d3289cf7bf0ac239e7c44d61 Mon Sep 17 00:00:00 2001
From: Nintorac Dev <Nintorac@users.noreply.github.com>
Date: Thu, 20 Apr 2023 19:01:53 +1000
Subject: [PATCH 1/2] feature: added RWKV example

---
 examples/Raven-RWKV/Dockerfile       |  44 ++++++
 examples/Raven-RWKV/README.md        |  48 +++++++
 examples/Raven-RWKV/get_models.py    |  52 +++++++
 examples/Raven-RWKV/lib_raven.py     | 202 +++++++++++++++++++++++++++
 examples/Raven-RWKV/model.py         | 141 +++++++++++++++++++
 examples/Raven-RWKV/requirements.txt |  11 ++
 examples/Raven-RWKV/server.py        |  55 ++++++++
 7 files changed, 553 insertions(+)
 create mode 100644 examples/Raven-RWKV/Dockerfile
 create mode 100644 examples/Raven-RWKV/README.md
 create mode 100644 examples/Raven-RWKV/get_models.py
 create mode 100644 examples/Raven-RWKV/lib_raven.py
 create mode 100644 examples/Raven-RWKV/model.py
 create mode 100644 examples/Raven-RWKV/requirements.txt
 create mode 100644 examples/Raven-RWKV/server.py

diff --git a/examples/Raven-RWKV/Dockerfile b/examples/Raven-RWKV/Dockerfile
new file mode 100644
index 0000000..f0232a1
--- /dev/null
+++ b/examples/Raven-RWKV/Dockerfile
@@ -0,0 +1,44 @@
+FROM nvidia/cuda:11.7.1-devel-ubuntu20.04
+
+# Update, install
+RUN apt-get update && \
+    apt-get install -y build-essential ninja-build git wget
+
+RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
+    bash Miniconda3-latest-Linux-x86_64.sh -b -p /opt/conda && \
+    rm Miniconda3-latest-Linux-x86_64.sh && \
+    /opt/conda/bin/conda create -y --name py39 python=3.9 && \
+    /opt/conda/bin/conda clean -ya
+
+ENV PATH /opt/conda/envs/py39/bin:$PATH
+
+RUN pip install --upgrade pip setuptools wheel
+
+# Create user instead of using root
+ENV USER='user'
+RUN groupadd -r user && useradd -r -g $USER $USER
+RUN mkdir -p /home/$USER/app
+RUN chown -R $USER:$USER /home/$USER
+USER $USER
+
+# Define workdir
+WORKDIR /home/$USER/app
+
+# Install project
+COPY requirements.txt requirements.txt
+RUN pip install -r requirements.txt
+
+COPY get_models.py .
+
+# Get model weights and tokenizer
+RUN python3 get_models.py
+
+# Copy rest
+COPY . .
+
+# Publish port
+EXPOSE 50051:50051
+
+# Enjoy
+ENTRYPOINT ["python3", "server.py"]
+CMD ["--address", "[::]:50051"]
\ No newline at end of file
diff --git a/examples/Raven-RWKV/README.md b/examples/Raven-RWKV/README.md
new file mode 100644
index 0000000..cd72fbb
--- /dev/null
+++ b/examples/Raven-RWKV/README.md
@@ -0,0 +1,48 @@
+# RavenRWKV service
+
+## Description
+
+This project uses the [RWKV-LM](https://github.com/BlinkDL/RWKV-LM) model and turns it into a gRPC service that can be used through [SimpleAI](https://github.com/lhenault/simpleAI).
+
+RWKV is an RNN with Transformer-level language model performance that can be trained like a GPT transformer and is 100% attention-free. It combines the best of RNNs and transformers: great performance, fast inference, low VRAM usage, fast training, "infinite" ctx_len, and free sentence embedding.
+
+## Usage
+
+Edit the `MODEL` variable in `get_models.py` to choose the model size and context length; valid values are the keys of the `models` dict in that file.
+
+Edit the `STRATEGY` variable in `lib_raven.py` to decide how the weights will be loaded; experiment with it to optimise throughput for your system. See below for a graphical explanation, or check out [ChatRWKV](https://github.com/BlinkDL/ChatRWKV) for more information.
+
+![Strategies as of 20 Apr 2023](https://raw.githubusercontent.com/BlinkDL/ChatRWKV/536b4b3bf87fbd999798141f409b151ca91a76c7/ChatRWKV-strategy.png)
+
+## Build
+
+```bash
+docker build . -t raven-rwkv-service:latest
+```
+
+## Start service
+
+```bash
+docker run -it --rm -p 50051:50051 --gpus all raven-rwkv-service:latest
+```
+
+## Add to models.toml
+
+
+```toml
+[raven]
+    [raven.metadata]
+        owned_by    = 'BlinkDL'
+        permission  = []
+        description = 'RWKV fine tuned for instruction answering'
+    [raven.network]
+        url = 'localhost:50051'
+```
+
+
+
+## Credits
+
+Heavily borrowed from lhenault & BlinkDL
+
+https://huggingface.co/spaces/BlinkDL/Raven-RWKV-7B
\ No newline at end of file
diff --git a/examples/Raven-RWKV/get_models.py b/examples/Raven-RWKV/get_models.py
new file mode 100644
index 0000000..45c80fe
--- /dev/null
+++ b/examples/Raven-RWKV/get_models.py
@@ -0,0 +1,52 @@
+from pathlib import Path
+
+import requests
+from huggingface_hub import hf_hub_download
+
+MODEL = "rwkv-4-pile-169m"
+
+TOKENIZER_PATH = Path(__file__).parent / "20B_tokenizer.json"
+models = {
+    "raven-14b-ctx4096": {
+        "repo_id": "BlinkDL/rwkv-4-raven",
+        "title": "RWKV-4-Raven-14B-v8-Eng-20230408-ctx4096",
+    },
+    "raven-7b-ctx4096": {
+        "repo_id": "BlinkDL/rwkv-4-raven",
+        "title": "RWKV-4-Raven-7B-v7-Eng-20230404-ctx4096",
+    },
+    "raven-7b-ctx1024": {
+        "repo_id": "BlinkDL/rwkv-4-pile-7b",
+        "title": "RWKV-4-Pile-7B-Instruct-test4-20230326",
+    },
+    "rwkv-4-pile-169m": {
+        "repo_id": "BlinkDL/rwkv-4-pile-169m",
+        "title": "RWKV-4-Pile-169M-20220807-8023",
+    },
+}
+
+
+def fetch_tokenizer(tokenizer_path: Path):
+    url = "https://huggingface.co/spaces/BlinkDL/Raven-RWKV-7B/raw/main/20B_tokenizer.json"
+    tokenizer_path.parent.mkdir(exist_ok=True)
+
+    response = requests.get(url)
+    tokenizer_path.write_bytes(response.content)
+
+
+def get_model_path(model="rwkv-4-pile-169m"):
+    tokenizer_path = Path(__file__).parent / "20B_tokenizer.json"
+    if not tokenizer_path.exists():
+        fetch_tokenizer(tokenizer_path)
+
+    model_params = models[model]
+
+    model_path = hf_hub_download(
+        repo_id=model_params["repo_id"], filename=f"{model_params['title']}.pth"
+    )
+
+    return model_path
+
+
+if __name__ == "__main__":
+    get_model_path(MODEL)
diff --git a/examples/Raven-RWKV/lib_raven.py b/examples/Raven-RWKV/lib_raven.py
new file mode 100644
index 0000000..178b5cb
--- /dev/null
+++ b/examples/Raven-RWKV/lib_raven.py
@@ -0,0 +1,202 @@
+import gc
+import logging
+import os
+from typing import List
+
+import torch
+from get_models import MODEL, TOKENIZER_PATH, get_model_path
+from pynvml import nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo, nvmlInit
+
+# if RWKV_CUDA_ON='1' then use CUDA kernel for seq mode (much faster)
+# these settings must be configured before attempting to import rwkv
+os.environ["RWKV_JIT_ON"] = "1"
+os.environ["RWKV_CUDA_ON"] = "1"
+from rwkv.model import RWKV  # noqa: E402
+from rwkv.utils import PIPELINE, PIPELINE_ARGS  # noqa: E402
+
+STRATEGIES = {
+    "streaming": "cuda fp16i8 *40+ -> cpu fp32 *1",  # Quite slow, take ~3gb VRAM
+    "fp16i8": "cuda fp16i8 *40 -> cpu fp32 *1",  # fits the 14b on a T4, quite fast
+    "cpu": "cpu fp32 *1",  # requires a lot of RAM
+}
+
+STRATEGY = STRATEGIES["streaming"]
+
+logger = logging.getLogger(__file__)
+
+nvmlInit()
+gpu_h = nvmlDeviceGetHandleByIndex(0)
+ctx_limit = 4096
+
+
+def get_model():
+    model_path = get_model_path(MODEL)
+
+    model = RWKV(
+        model=model_path, strategy="cuda fp16i8 *40 -> cuda fp16i8 *0+ -> cpu fp32 *1"
+    )  # stream mode w/some static
+
+    pipeline = PIPELINE(model, str(TOKENIZER_PATH))
+
+    return model, pipeline
+
+
+def generate_prompt(instruction, prompt=None):
+    if prompt:
+        return f"""Below is an instruction that describes a task, paired with an input"\
+        " that provides further context. Write a response that appropriately completes the request.
+
+# Instruction:
+{instruction}
+
+# Input:
+{prompt}
+
+# Response:
+"""
+    else:
+        return f"""Below is an instruction that describes a task. Write a response that "\
+                    "appropriately completes the request.
+
+# Instruction:
+{instruction}
+
+# Response:
+"""
+
+
+def chat(
+    instruction,
+    model,
+    pipeline,
+    prompt="",
+    token_count=200,
+    temperature=1.0,
+    top_p=0.7,
+    presencePenalty=0.1,
+    countPenalty=0.1,
+):
+    args = PIPELINE_ARGS(
+        temperature=max(0.2, float(temperature)),
+        top_p=float(top_p),
+        alpha_frequency=countPenalty,
+        alpha_presence=presencePenalty,
+        token_ban=[],  # ban the generation of some tokens
+        token_stop=[0],
+    )  # stop generation whenever you see any token here
+
+    ctx = instruction
+
+    gpu_info = nvmlDeviceGetMemoryInfo(gpu_h)
+    logger.debug(f"vram {gpu_info.total} used {gpu_info.used} free {gpu_info.free}")
+
+    all_tokens = []
+    out_last = 0
+    out_str = ""
+    occurrence = {}
+    state = None
+    token = None
+    for i in range(int(token_count)):
+        out, state = model.forward(pipeline.encode(ctx)[-ctx_limit:] if i == 0 else [token], state)
+        for n in occurrence:
+            out[n] -= args.alpha_presence + occurrence[n] * args.alpha_frequency
+
+        token = pipeline.sample_logits(out, temperature=args.temperature, top_p=args.top_p)
+        if token in args.token_stop:
+            break
+        all_tokens += [token]
+        if token not in occurrence:
+            occurrence[token] = 1
+        else:
+            occurrence[token] += 1
+
+        tmp = pipeline.decode(all_tokens[out_last:])
+        if "\ufffd" not in tmp:
+            out_str += tmp
+            yield tmp
+            out_last = i + 1
+    gc.collect()
+    torch.cuda.empty_cache()
+
+
+def embedding(
+    inputs: List[str],
+    model,
+    pipeline,
+    temperature=1.0,  # TODO remove
+    top_p=0.7,
+    presencePenalty=0.1,
+    countPenalty=0.1,
+):
+    PIPELINE_ARGS(
+        temperature=max(0.2, float(temperature)),
+        top_p=float(top_p),
+        alpha_frequency=countPenalty,
+        alpha_presence=presencePenalty,
+        token_ban=[],  # ban the generation of some tokens
+        token_stop=[0],
+    )  # stop generation whenever you see any token here
+
+    gpu_info = nvmlDeviceGetMemoryInfo(gpu_h)
+    logger.debug(f"vram {gpu_info.total} used {gpu_info.used} free {gpu_info.free}")
+
+    context = [pipeline.encode(ctx)[-ctx_limit:] for ctx in inputs]
+    _, state = model.forward(context[0], None)
+    *_, embedding = state
+
+    if len(embedding.shape) == 1:
+        embedding = embedding.unsqueeze(0)
+    return embedding
+
+
+def complete(
+    prompt,
+    model,
+    pipeline,
+    token_count=200,
+    temperature=1.0,
+    top_p=0.7,
+    presencePenalty=0.1,
+    countPenalty=0.1,
+):
+    args = PIPELINE_ARGS(
+        temperature=max(0.2, float(temperature)),
+        top_p=float(top_p),
+        alpha_frequency=countPenalty,
+        alpha_presence=presencePenalty,
+        token_ban=[],  # ban the generation of some tokens
+        token_stop=[0],
+    )  # stop generation whenever you see any token here
+
+    ctx = prompt
+
+    gpu_info = nvmlDeviceGetMemoryInfo(gpu_h)
+    logger.debug(f"vram {gpu_info.total} used {gpu_info.used} free {gpu_info.free}")
+
+    all_tokens = []
+    out_last = 0
+    out_str = ""
+    occurrence = {}
+    state = None
+    token = None
+    for i in range(int(token_count)):
+        out, state = model.forward(pipeline.encode(ctx)[-ctx_limit:] if i == 0 else [token], state)
+        for n in occurrence:
+            out[n] -= args.alpha_presence + occurrence[n] * args.alpha_frequency
+
+        token = pipeline.sample_logits(out, temperature=args.temperature, top_p=args.top_p)
+        if token in args.token_stop:
+            break
+        all_tokens += [token]
+        if token not in occurrence:
+            occurrence[token] = 1
+        else:
+            occurrence[token] += 1
+
+        tmp = pipeline.decode(all_tokens[out_last:])
+        if "\ufffd" not in tmp:
+            out_str += tmp
+            yield tmp
+            out_last = i + 1
+    gc.collect()
+    torch.cuda.empty_cache()
diff --git a/examples/Raven-RWKV/model.py b/examples/Raven-RWKV/model.py
new file mode 100644
index 0000000..04ae60b
--- /dev/null
+++ b/examples/Raven-RWKV/model.py
@@ -0,0 +1,141 @@
+import logging
+from dataclasses import dataclass
+
+import lib_raven
+import torch
+from simple_ai.api.grpc.chat.server import LanguageModel
+from simple_ai.utils import format_chat_log
+
+
+def endOverlap(a, b):
+    for i in range(1, len(a) + 1):
+        if b.startswith(a[-i:]):
+            return i
+    return 0
+
+
+@dataclass(unsafe_hash=True)
+class RavenRWKVModel(LanguageModel):
+    gpu_id: int = 0
+    device = torch.device("cuda", gpu_id) if torch.cuda.is_available() else torch.device("cpu")
+    model, pipeline = lib_raven.get_model()
+
+    def chat(
+        self,
+        chatlog: list[list[str]] = None,
+        max_tokens: int = 512,
+        temperature: float = 0.9,
+        top_p: int = 0.5,
+        presencePenalty: int = 0.4,
+        countPenalty: int = 0.4,
+        *args,
+        **kwargs,
+    ) -> str:
+        prompt = format_chat_log(chatlog)
+        output = lib_raven.chat(
+            prompt,
+            self.model,
+            self.pipeline,
+            prompt=None,
+            token_count=max_tokens,
+            temperature=temperature,
+            top_p=top_p,
+            presencePenalty=presencePenalty,
+            countPenalty=countPenalty,
+        )
+
+        output = "".join(output)
+
+        return [{"role": "raven", "content": output}]
+
+    def complete(
+        self,
+        *args,
+        **kwargs,
+    ) -> str:
+        output = self.stream_complete(*args, **kwargs)
+        output = "".join(output)
+
+        return output
+
+    def stream_complete(
+        self,
+        prompt: str = None,
+        max_tokens: int = 512,
+        temperature: float = 0.9,
+        top_p: int = 0.5,
+        presencePenalty: int = 0.4,
+        countPenalty: int = 0.4,
+        *args,
+        **kwargs,
+    ) -> str:
+        output = lib_raven.complete(
+            prompt,
+            self.model,
+            self.pipeline,
+            token_count=max_tokens,
+            temperature=temperature,
+            top_p=top_p,
+            presencePenalty=presencePenalty,
+            countPenalty=countPenalty,
+        )
+        yield from output
+
+    def stream(
+        self,
+        chatlog: list[list[str]] = None,
+        max_tokens: int = 512,
+        temperature: float = 0.9,
+        top_p: int = 0.5,
+        presencePenalty: int = 0.4,
+        countPenalty: int = 0.4,
+        *args,
+        **kwargs,
+    ):
+        yield [{"role": "raven"}]
+
+        stop_words = set([f"{message['role']}:" for message in chatlog])
+
+        prompt = format_chat_log(chatlog)
+        chunk = ""
+        for delta in lib_raven.chat(
+            prompt,
+            self.model,
+            self.pipeline,
+            prompt=None,
+            token_count=max_tokens,
+            temperature=temperature,
+            top_p=top_p,
+            presencePenalty=presencePenalty,
+            countPenalty=countPenalty,
+        ):
+            chunk = chunk + delta
+            longest_stopword = max(map(len, stop_words))
+
+            if start_idx := max(map(lambda stop_word: endOverlap(chunk, stop_word), stop_words)):
+                if start_idx > longest_stopword:
+                    start_idx = longest_stopword  # can no longer be a stopword so cut it down
+                good, chunk = chunk[:-start_idx], chunk[-start_idx:]
+
+                if good:
+                    yield [{"content": good}]
+
+                if any(map(lambda stop_word: chunk.startswith(stop_word), stop_words)):
+                    return
+                continue
+
+            # if start_idx:=max(map(lambda stop_word: endOverlap(stop_word, chunk), stop_words))>0:
+
+            yield [{"content": chunk}]
+            chunk = ""
+
+    def embed(
+        self,
+        inputs: list = [],
+    ) -> list:
+        logging.info(f"Processing inputs : {inputs}")
+        embeddings = lib_raven.embedding(inputs, self.model, self.pipeline)
+        logging.info(
+            f"Successfully computed embeddings (shape : {embeddings.shape}) for inputs : {inputs}"
+        )
+        return embeddings.tolist()
diff --git a/examples/Raven-RWKV/requirements.txt b/examples/Raven-RWKV/requirements.txt
new file mode 100644
index 0000000..ab1377a
--- /dev/null
+++ b/examples/Raven-RWKV/requirements.txt
@@ -0,0 +1,11 @@
+inquirer==3.1.3
+loralib==0.1.1
+ninja==1.11.1
+pynvml==11.5.0
+rwkv==0.6.2
+scipy==1.10.1
+sentencepiece==0.1.97
+simple-ai-server==0.2.0
+tokenizers==0.13.2
+torch==2.0.0
+transformers==4.27.3
diff --git a/examples/Raven-RWKV/server.py b/examples/Raven-RWKV/server.py
new file mode 100644
index 0000000..d663326
--- /dev/null
+++ b/examples/Raven-RWKV/server.py
@@ -0,0 +1,55 @@
+import logging
+from concurrent import futures
+
+import grpc
+from model import RavenRWKVModel as Model
+from simple_ai.api.grpc.chat.server import (
+    LanguageModelServicer as ChatServicer,
+    llm_chat_pb2_grpc,
+)
+from simple_ai.api.grpc.completion.server import (
+    LanguageModelServicer as CompletionServicer,
+    llm_pb2_grpc,
+)
+from simple_ai.api.grpc.embedding.server import (
+    LanguageModelServicer as EmbeddingServicer,
+    llm_embed_pb2_grpc,
+)
+
+
+def serve(
+    address="[::]:50051",
+    chat_servicer=None,
+    embedding_servicer=None,
+    completion_servicer=None,
+    max_workers=10,
+):
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=max_workers))
+    llm_chat_pb2_grpc.add_LanguageModelServicer_to_server(chat_servicer, server)
+    llm_embed_pb2_grpc.add_LanguageModelServicer_to_server(embedding_servicer, server)
+    llm_pb2_grpc.add_LanguageModelServicer_to_server(completion_servicer, server)
+    server.add_insecure_port(address=address)
+    server.start()
+    server.wait_for_termination()
+
+
+if __name__ == "__main__":
+    import argparse
+
+    logging.basicConfig(level=logging.INFO)
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-a", "--address", type=str, default="[::]:50051")
+    args = parser.parse_args()
+
+    logging.info(f"Starting gRPC server on {args.address}")
+    model = Model()
+    chat_servicer = ChatServicer(model=Model())
+    embedding_servicer = EmbeddingServicer(model=Model())
+    completion_servicer = CompletionServicer(model=Model())
+    serve(
+        address=args.address,
+        chat_servicer=chat_servicer,
+        embedding_servicer=embedding_servicer,
+        completion_servicer=completion_servicer,
+    )

From 47051045bfb82c40840d309bd26d5bba6b8ee7bf Mon Sep 17 00:00:00 2001
From: Nintorac Dev <Nintorac@users.noreply.github.com>
Date: Sun, 14 May 2023 18:23:11 +1000
Subject: [PATCH 2/2] cleanup + fix non-GPU instances not working

---
 examples/Raven-RWKV/get_models.py    |  10 ++-
 examples/Raven-RWKV/lib_raven.py     | 114 +++------------------------
 examples/Raven-RWKV/logging.conf     |  36 +++++++++
 examples/Raven-RWKV/model.py         |  57 +++++++-------
 examples/Raven-RWKV/requirements.txt |   2 +-
 5 files changed, 83 insertions(+), 136 deletions(-)
 create mode 100644 examples/Raven-RWKV/logging.conf

diff --git a/examples/Raven-RWKV/get_models.py b/examples/Raven-RWKV/get_models.py
index 45c80fe..3908388 100644
--- a/examples/Raven-RWKV/get_models.py
+++ b/examples/Raven-RWKV/get_models.py
@@ -3,7 +3,7 @@
 import requests
 from huggingface_hub import hf_hub_download
 
-MODEL = "rwkv-4-pile-169m"
+MODEL = "raven-1b-ctx4096"
 
 TOKENIZER_PATH = Path(__file__).parent / "20B_tokenizer.json"
 models = {
@@ -23,6 +23,14 @@
         "repo_id": "BlinkDL/rwkv-4-pile-169m",
         "title": "RWKV-4-Pile-169M-20220807-8023",
     },
+    "raven-1b-ctx4096": {
+        "repo_id": "BlinkDL/rwkv-4-raven",
+        "title": "RWKV-4-Raven-1B5-v11-Eng99%-Other1%-20230425-ctx4096",
+    },
+    "raven-3b-ctx4096": {
+        "repo_id": "BlinkDL/rwkv-4-raven",
+        "title": "RWKV-4-Raven-3B-v11-Eng99%-Other1%-20230425-ctx4096",
+    },
 }
 
 
diff --git a/examples/Raven-RWKV/lib_raven.py b/examples/Raven-RWKV/lib_raven.py
index 178b5cb..85d7417 100644
--- a/examples/Raven-RWKV/lib_raven.py
+++ b/examples/Raven-RWKV/lib_raven.py
@@ -1,18 +1,12 @@
-import gc
 import logging
-import os
 from typing import List
 
-import torch
 from get_models import MODEL, TOKENIZER_PATH, get_model_path
-from pynvml import nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo, nvmlInit
 
 # if RWKV_CUDA_ON='1' then use CUDA kernel for seq mode (much faster)
 # these settings must be configured before attempting to import rwkv
-os.environ["RWKV_JIT_ON"] = "1"
-os.environ["RWKV_CUDA_ON"] = "1"
-from rwkv.model import RWKV  # noqa: E402
-from rwkv.utils import PIPELINE, PIPELINE_ARGS  # noqa: E402
+from rwkv.model import RWKV
+from rwkv.utils import PIPELINE, PIPELINE_ARGS
 
 STRATEGIES = {
     "streaming": "cuda fp16i8 *40+ -> cpu fp32 *1",  # Quite slow, take ~3gb VRAM
@@ -20,21 +14,17 @@
     "cpu": "cpu fp32 *1",  # requires a lot of RAM
 }
 
-STRATEGY = STRATEGIES["streaming"]
+STRATEGY = STRATEGIES["cpu"]
 
 logger = logging.getLogger(__file__)
 
-nvmlInit()
-gpu_h = nvmlDeviceGetHandleByIndex(0)
 ctx_limit = 4096
 
 
 def get_model():
     model_path = get_model_path(MODEL)
 
-    model = RWKV(
-        model=model_path, strategy="cuda fp16i8 *40 -> cuda fp16i8 *0+ -> cpu fp32 *1"
-    )  # stream mode w/some static
+    model = RWKV(model=model_path, strategy=STRATEGY)  # stream mode w/some static
 
     pipeline = PIPELINE(model, str(TOKENIZER_PATH))
 
@@ -65,16 +55,17 @@ def generate_prompt(instruction, prompt=None):
 """
 
 
-def chat(
+def complete(
     instruction,
     model,
-    pipeline,
+    pipeline: PIPELINE,
     prompt="",
     token_count=200,
     temperature=1.0,
     top_p=0.7,
     presencePenalty=0.1,
     countPenalty=0.1,
+    stop_words=None,
 ):
     args = PIPELINE_ARGS(
         temperature=max(0.2, float(temperature)),
@@ -83,40 +74,11 @@ def chat(
         alpha_presence=presencePenalty,
         token_ban=[],  # ban the generation of some tokens
         token_stop=[0],
+        stop_words=stop_words,
     )  # stop generation whenever you see any token here
 
-    ctx = instruction
-
-    gpu_info = nvmlDeviceGetMemoryInfo(gpu_h)
-    logger.debug(f"vram {gpu_info.total} used {gpu_info.used} free {gpu_info.free}")
-
-    all_tokens = []
-    out_last = 0
-    out_str = ""
-    occurrence = {}
-    state = None
-    token = None
-    for i in range(int(token_count)):
-        out, state = model.forward(pipeline.encode(ctx)[-ctx_limit:] if i == 0 else [token], state)
-        for n in occurrence:
-            out[n] -= args.alpha_presence + occurrence[n] * args.alpha_frequency
-
-        token = pipeline.sample_logits(out, temperature=args.temperature, top_p=args.top_p)
-        if token in args.token_stop:
-            break
-        all_tokens += [token]
-        if token not in occurrence:
-            occurrence[token] = 1
-        else:
-            occurrence[token] += 1
-
-        tmp = pipeline.decode(all_tokens[out_last:])
-        if "\ufffd" not in tmp:
-            out_str += tmp
-            yield tmp
-            out_last = i + 1
-    gc.collect()
-    torch.cuda.empty_cache()
+    for delta in pipeline.igenerate(ctx=instruction, token_count=token_count, args=args):
+        yield delta
 
 
 def embedding(
@@ -137,9 +99,6 @@ def embedding(
         token_stop=[0],
     )  # stop generation whenever you see any token here
 
-    gpu_info = nvmlDeviceGetMemoryInfo(gpu_h)
-    logger.debug(f"vram {gpu_info.total} used {gpu_info.used} free {gpu_info.free}")
-
     context = [pipeline.encode(ctx)[-ctx_limit:] for ctx in inputs]
     _, state = model.forward(context[0], None)
     *_, embedding = state
@@ -147,56 +106,3 @@ def embedding(
     if len(embedding.shape) == 1:
         embedding = embedding.unsqueeze(0)
     return embedding
-
-
-def complete(
-    prompt,
-    model,
-    pipeline,
-    token_count=200,
-    temperature=1.0,
-    top_p=0.7,
-    presencePenalty=0.1,
-    countPenalty=0.1,
-):
-    args = PIPELINE_ARGS(
-        temperature=max(0.2, float(temperature)),
-        top_p=float(top_p),
-        alpha_frequency=countPenalty,
-        alpha_presence=presencePenalty,
-        token_ban=[],  # ban the generation of some tokens
-        token_stop=[0],
-    )  # stop generation whenever you see any token here
-
-    ctx = prompt
-
-    gpu_info = nvmlDeviceGetMemoryInfo(gpu_h)
-    logger.debug(f"vram {gpu_info.total} used {gpu_info.used} free {gpu_info.free}")
-
-    all_tokens = []
-    out_last = 0
-    out_str = ""
-    occurrence = {}
-    state = None
-    token = None
-    for i in range(int(token_count)):
-        out, state = model.forward(pipeline.encode(ctx)[-ctx_limit:] if i == 0 else [token], state)
-        for n in occurrence:
-            out[n] -= args.alpha_presence + occurrence[n] * args.alpha_frequency
-
-        token = pipeline.sample_logits(out, temperature=args.temperature, top_p=args.top_p)
-        if token in args.token_stop:
-            break
-        all_tokens += [token]
-        if token not in occurrence:
-            occurrence[token] = 1
-        else:
-            occurrence[token] += 1
-
-        tmp = pipeline.decode(all_tokens[out_last:])
-        if "\ufffd" not in tmp:
-            out_str += tmp
-            yield tmp
-            out_last = i + 1
-    gc.collect()
-    torch.cuda.empty_cache()
diff --git a/examples/Raven-RWKV/logging.conf b/examples/Raven-RWKV/logging.conf
new file mode 100644
index 0000000..894773c
--- /dev/null
+++ b/examples/Raven-RWKV/logging.conf
@@ -0,0 +1,36 @@
+[loggers]
+keys=root,uicheckapp
+
+[handlers]
+keys=consoleHandler,detailedConsoleHandler
+
+[formatters]
+keys=normalFormatter,detailedFormatter
+
+[logger_root]
+level=INFO
+handlers=consoleHandler
+
+[logger_uicheckapp]
+level=DEBUG
+handlers=detailedConsoleHandler
+qualname=uicheckapp
+propagate=0
+
+[handler_consoleHandler]
+class=StreamHandler
+level=DEBUG
+formatter=normalFormatter
+args=(sys.stdout,)
+
+[handler_detailedConsoleHandler]
+class=StreamHandler
+level=DEBUG
+formatter=detailedFormatter
+args=(sys.stdout,)
+
+[formatter_normalFormatter]
+format=%(asctime)s loglevel=%(levelname)-6s logger=%(name)s %(funcName)s() L%(lineno)-4d %(message)s
+
+[formatter_detailedFormatter]
+format=%(asctime)s loglevel=%(levelname)-6s logger=%(name)s %(funcName)s() L%(lineno)-4d %(message)s   call_trace=%(pathname)s L%(lineno)-4d
\ No newline at end of file
diff --git a/examples/Raven-RWKV/model.py b/examples/Raven-RWKV/model.py
index 04ae60b..6976559 100644
--- a/examples/Raven-RWKV/model.py
+++ b/examples/Raven-RWKV/model.py
@@ -1,17 +1,24 @@
+import json
 import logging
+import re
 from dataclasses import dataclass
 
 import lib_raven
 import torch
 from simple_ai.api.grpc.chat.server import LanguageModel
-from simple_ai.utils import format_chat_log
 
 
-def endOverlap(a, b):
-    for i in range(1, len(a) + 1):
-        if b.startswith(a[-i:]):
-            return i
-    return 0
+def format_chat_log(chat: list[dict[str, str]] = dict()) -> str:
+    raw_chat_text = ""
+    for item in chat:
+        if item["role"] not in ("user", "assistant"):
+            continue
+        role = "Bob" if item.get("role") == "user" else "Alice"
+        content = item.get("content").strip()
+        content = re.sub("\n+", "\n", content)
+
+        raw_chat_text += f"{role}: {content}\n\n"
+    return raw_chat_text + "Alice:"
 
 
 @dataclass(unsafe_hash=True)
@@ -32,7 +39,7 @@ def chat(
         **kwargs,
     ) -> str:
         prompt = format_chat_log(chatlog)
-        output = lib_raven.chat(
+        output = lib_raven.complete(
             prompt,
             self.model,
             self.pipeline,
@@ -66,9 +73,11 @@ def stream_complete(
         top_p: int = 0.5,
         presencePenalty: int = 0.4,
         countPenalty: int = 0.4,
-        *args,
+        stop=None,
+        # *args,
         **kwargs,
     ) -> str:
+        stop = json.loads(stop)
         output = lib_raven.complete(
             prompt,
             self.model,
@@ -78,6 +87,7 @@ def stream_complete(
             top_p=top_p,
             presencePenalty=presencePenalty,
             countPenalty=countPenalty,
+            stop_words=stop,
         )
         yield from output
 
@@ -94,11 +104,11 @@ def stream(
     ):
         yield [{"role": "raven"}]
 
-        stop_words = set([f"{message['role']}:" for message in chatlog])
+        stop_words = ["\n\nBob:", "\n\nAlice:"]
 
         prompt = format_chat_log(chatlog)
-        chunk = ""
-        for delta in lib_raven.chat(
+        first = True
+        for delta in lib_raven.complete(
             prompt,
             self.model,
             self.pipeline,
@@ -108,26 +118,13 @@ def stream(
             top_p=top_p,
             presencePenalty=presencePenalty,
             countPenalty=countPenalty,
+            stop_words=stop_words,
         ):
-            chunk = chunk + delta
-            longest_stopword = max(map(len, stop_words))
-
-            if start_idx := max(map(lambda stop_word: endOverlap(chunk, stop_word), stop_words)):
-                if start_idx > longest_stopword:
-                    start_idx = longest_stopword  # can no longer be a stopword so cut it down
-                good, chunk = chunk[:-start_idx], chunk[-start_idx:]
-
-                if good:
-                    yield [{"content": good}]
-
-                if any(map(lambda stop_word: chunk.startswith(stop_word), stop_words)):
-                    return
-                continue
-
-            # if start_idx:=max(map(lambda stop_word: endOverlap(stop_word, chunk), stop_words))>0:
-
-            yield [{"content": chunk}]
-            chunk = ""
+            clean_delta = delta
+            if first:
+                clean_delta = delta[1:]  ## remove leading whitespace in completion
+                first = False
+            yield [{"content": clean_delta}]
 
     def embed(
         self,
diff --git a/examples/Raven-RWKV/requirements.txt b/examples/Raven-RWKV/requirements.txt
index ab1377a..2d40d61 100644
--- a/examples/Raven-RWKV/requirements.txt
+++ b/examples/Raven-RWKV/requirements.txt
@@ -2,10 +2,10 @@ inquirer==3.1.3
 loralib==0.1.1
 ninja==1.11.1
 pynvml==11.5.0
-rwkv==0.6.2
 scipy==1.10.1
 sentencepiece==0.1.97
 simple-ai-server==0.2.0
 tokenizers==0.13.2
 torch==2.0.0
 transformers==4.27.3
+git+https://github.com/Nintorac/ChatRWKV.git@parallel#subdirectory=rwkv_pip_package