diff --git a/examples/Raven-RWKV/Dockerfile b/examples/Raven-RWKV/Dockerfile
new file mode 100644
index 0000000..f0232a1
--- /dev/null
+++ b/examples/Raven-RWKV/Dockerfile
@@ -0,0 +1,44 @@
+FROM nvidia/cuda:11.7.1-devel-ubuntu20.04
+
+# Build toolchain, plus wget (fetches Miniconda below) and git (for the git+https requirement)
+RUN apt-get update && \
+    apt-get install -y build-essential ninja-build git wget
+
+# Miniconda provides an isolated Python 3.9 environment named py39
+RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
+    bash Miniconda3-latest-Linux-x86_64.sh -b -p /opt/conda && \
+    rm Miniconda3-latest-Linux-x86_64.sh && \
+    /opt/conda/bin/conda create -y --name py39 python=3.9 && \
+    /opt/conda/bin/conda clean -ya
+
+# Put the py39 environment first on PATH so python3/pip resolve to it
+ENV PATH=/opt/conda/envs/py39/bin:$PATH
+RUN pip install --upgrade pip setuptools wheel
+
+# Create user instead of using root
+ENV USER='user'
+RUN groupadd -r $USER && useradd -r -g $USER $USER && \
+    mkdir -p /home/$USER/app && \
+    chown -R $USER:$USER /home/$USER
+USER $USER
+
+# Define workdir
+WORKDIR /home/$USER/app
+
+# Install project
+COPY requirements.txt requirements.txt
+RUN pip install -r requirements.txt
+
+# Get model weights and tokenizer
+COPY get_models.py .
+RUN python3 get_models.py
+
+# Copy rest
+COPY . .
+
+# Document the gRPC port; EXPOSE takes no host mapping, publish with `docker run -p 50051:50051`
+EXPOSE 50051
+
+# Enjoy
+ENTRYPOINT ["python3", "server.py"]
+CMD ["--address", "[::]:50051"]
\ No newline at end of file
diff --git a/examples/Raven-RWKV/README.md b/examples/Raven-RWKV/README.md
new file mode 100644
index 0000000..cd72fbb
--- /dev/null
+++ b/examples/Raven-RWKV/README.md
@@ -0,0 +1,48 @@
+# RavenRWKV service
+
+## Description
+
+This project uses the [RWKV-LM](https://github.com/BlinkDL/RWKV-LM) model and turns it into a gRPC service that can be used through [SimpleAI](https://github.com/lhenault/simpleAI).
+
+RWKV is an RNN with Transformer-level language model performance that can be trained like a GPT transformer and is 100% attention-free. It combines the best of RNNs and transformers: great performance, fast inference, VRAM savings, fast training, "infinite" ctx_len, and free sentence embedding.
+
+## Usage
+
+Edit the `MODEL` variable in `get_models.py` to choose the model size and context.
+
+Edit the `STRATEGY` variable in `lib_raven.py` to decide how the weights will be loaded; play with this to optimise the throughput for your system. See below for a graphic explanation or check out [ChatRWKV](https://github.com/BlinkDL/ChatRWKV) for more information.
+
+![Strategies as of 20 Apr 2023](https://raw.githubusercontent.com/BlinkDL/ChatRWKV/536b4b3bf87fbd999798141f409b151ca91a76c7/ChatRWKV-strategy.png)
+
+## Build
+
+```bash
+docker build . -t raven-rwkv-service:latest
+```
+
+## Start service
+
+```bash
+docker run -it --rm -p 50051:50051 --gpus all raven-rwkv-service:latest
+```
+
+## Add to model.toml
+
+Register the service in your SimpleAI models configuration:
+
+```toml
+[raven]
+    [raven.metadata]
+        owned_by    = 'BlinkDL'
+        permission  = []
+        description = 'RWKV fine tuned for instruction answering'
+    [raven.network]
+        url = 'localhost:50051'
+```
+
+
+## Credits
+
+Heavily borrowed from lhenault & BlinkDL
+
+https://huggingface.co/spaces/BlinkDL/Raven-RWKV-7B
\ No newline at end of file
diff --git a/examples/Raven-RWKV/get_models.py b/examples/Raven-RWKV/get_models.py
new file mode 100644
index 0000000..3908388
--- /dev/null
+++ b/examples/Raven-RWKV/get_models.py
@@ -0,0 +1,60 @@
+from pathlib import Path
+
+import requests
+from huggingface_hub import hf_hub_download
+
+MODEL = "raven-1b-ctx4096"  # key into `models`: the checkpoint downloaded when run as a script
+# Path of the tokenizer JSON, next to this file (lib_raven.py passes it to PIPELINE).
+TOKENIZER_PATH = Path(__file__).parent / "20B_tokenizer.json"
+models = {  # short key -> Hugging Face repo id and checkpoint title (file name without ".pth")
+    "raven-14b-ctx4096": {
+        "repo_id": "BlinkDL/rwkv-4-raven",
+        "title": "RWKV-4-Raven-14B-v8-Eng-20230408-ctx4096",
+    },
+    "raven-7b-ctx4096": {
+        "repo_id": "BlinkDL/rwkv-4-raven",
+        "title": "RWKV-4-Raven-7B-v7-Eng-20230404-ctx4096",
+    },
+    "raven-7b-ctx1024": {
+        "repo_id": "BlinkDL/rwkv-4-pile-7b",
+        "title": "RWKV-4-Pile-7B-Instruct-test4-20230326",
+    },
+    "rwkv-4-pile-169m": {
+        "repo_id": "BlinkDL/rwkv-4-pile-169m",
+        "title": "RWKV-4-Pile-169M-20220807-8023",
+    },
+    "raven-1b-ctx4096": {
+        "repo_id": "BlinkDL/rwkv-4-raven",
+        "title": "RWKV-4-Raven-1B5-v11-Eng99%-Other1%-20230425-ctx4096",
+    },
+    "raven-3b-ctx4096": {
+        "repo_id": "BlinkDL/rwkv-4-raven",
+        "title": "RWKV-4-Raven-3B-v11-Eng99%-Other1%-20230425-ctx4096",
+    },
+}
+
+
+def fetch_tokenizer(tokenizer_path: Path):
+    """Download the 20B tokenizer JSON and write it to ``tokenizer_path``.
+
+    Raises ``requests.HTTPError`` on an error status, so an error page is never saved as the tokenizer.
+    """
+    url = "https://huggingface.co/spaces/BlinkDL/Raven-RWKV-7B/raw/main/20B_tokenizer.json"
+    tokenizer_path.parent.mkdir(exist_ok=True)
+    response = requests.get(url, timeout=60)
+    response.raise_for_status()
+    tokenizer_path.write_bytes(response.content)
+
+
+def get_model_path(model="rwkv-4-pile-169m"):
+    """Make sure the tokenizer file exists, then return the local path of ``model``'s ``.pth`` file."""
+    if not TOKENIZER_PATH.exists():
+        fetch_tokenizer(TOKENIZER_PATH)
+    model_params = models[model]  # KeyError for a key that is not in ``models``
+    return hf_hub_download(
+        repo_id=model_params["repo_id"], filename=f"{model_params['title']}.pth"
+    )
+
+
+if __name__ == "__main__":  # script mode: the Dockerfile runs this file at image build time
+    get_model_path(MODEL)  # fetches the tokenizer and the checkpoint selected by MODEL
diff --git a/examples/Raven-RWKV/lib_raven.py b/examples/Raven-RWKV/lib_raven.py
new file mode 100644
index 0000000..85d7417
--- /dev/null
+++ b/examples/Raven-RWKV/lib_raven.py
@@ -0,0 +1,108 @@
+import logging
+from typing import List
+
+from get_models import MODEL, TOKENIZER_PATH, get_model_path
+
+# if RWKV_CUDA_ON='1' then use CUDA kernel for seq mode (much faster)
+# these settings must be configured before attempting to import rwkv
+from rwkv.model import RWKV
+from rwkv.utils import PIPELINE, PIPELINE_ARGS
+
+STRATEGIES = {  # named strategy strings; one of them is selected as STRATEGY below
+    "streaming": "cuda fp16i8 *40+ -> cpu fp32 *1",  # Quite slow, take ~3gb VRAM
+    "fp16i8": "cuda fp16i8 *40 -> cpu fp32 *1",  # fits the 14b on a T4, quite fast
+    "cpu": "cpu fp32 *1",  # requires a lot of RAM
+}
+# Passed as the `strategy` argument of RWKV(...) in get_model().
+STRATEGY = STRATEGIES["cpu"]
+
+logger = logging.getLogger(__file__)  # NOTE(review): named by file path; __name__ is the usual key
+# embedding() keeps only the last `ctx_limit` tokens of each input.
+ctx_limit = 4096
+
+
+def get_model():
+    """Load the RWKV model selected by ``MODEL`` and build its tokenizer pipeline.
+
+    Returns a ``(model, pipeline)`` tuple; ``STRATEGY`` is passed to ``RWKV`` as ``strategy``.
+    """
+    rwkv_model = RWKV(model=get_model_path(MODEL), strategy=STRATEGY)
+    # get_model_path() has already fetched the tokenizer file if it was missing.
+    return rwkv_model, PIPELINE(rwkv_model, str(TOKENIZER_PATH))
+
+
+def generate_prompt(instruction, prompt=None):
+    """Build the instruction prompt, adding an ``# Input:`` section when ``prompt`` is truthy."""
+    # The templates are flush-left and unwrapped on purpose: every character inside the
+    # literals (including indentation and quotes) becomes part of the prompt text.
+    if prompt:
+        return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
+
+# Instruction:
+{instruction}
+
+# Input:
+{prompt}
+
+# Response:
+"""
+    return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.
+
+# Instruction:
+{instruction}
+
+# Response:
+"""
+
+
+def complete(
+    instruction,
+    model,
+    pipeline: PIPELINE,
+    prompt="",
+    token_count=200,
+    temperature=1.0,
+    top_p=0.7,
+    presencePenalty=0.1,
+    countPenalty=0.1,
+    stop_words=None,
+):
+    """Lazily yield generated text pieces for ``instruction``; ``model`` and ``prompt`` are unused."""
+    # The temperature is floored at 0.2 before it is handed to the pipeline.
+    sampling = PIPELINE_ARGS(
+        temperature=max(0.2, float(temperature)),
+        top_p=float(top_p),
+        alpha_frequency=countPenalty,
+        alpha_presence=presencePenalty,
+        token_ban=[],  # ban the generation of some tokens
+        token_stop=[0],  # stop generation whenever you see any token here
+        stop_words=stop_words,
+    )
+    yield from pipeline.igenerate(ctx=instruction, token_count=token_count, args=sampling)
+
+
+def embedding(
+    inputs: List[str],
+    model,
+    pipeline,
+    temperature=1.0,  # TODO remove
+    top_p=0.7,
+    presencePenalty=0.1,
+    countPenalty=0.1,
+):
+    """Return one embedding row per input string as a single tensor.
+
+    Each input is encoded with ``pipeline.encode``, cut to its last ``ctx_limit`` tokens and
+    run through ``model.forward``; the last entry of the returned state is that input's
+    embedding. The sampling parameters are accepted but not used.
+    """
+    import torch  # local import: this module has no file-level torch import
+
+    rows = []
+    for text in inputs:
+        tokens = pipeline.encode(text)[-ctx_limit:]
+        _, state = model.forward(tokens, None)
+        *_, row = state
+        # 1-D vectors become single rows so the per-input results can be concatenated.
+        rows.append(row.unsqueeze(0) if len(row.shape) == 1 else row)
+    return torch.cat(rows)
diff --git a/examples/Raven-RWKV/logging.conf b/examples/Raven-RWKV/logging.conf
new file mode 100644
index 0000000..894773c
--- /dev/null
+++ b/examples/Raven-RWKV/logging.conf
@@ -0,0 +1,36 @@
+[loggers]
+keys=root,uicheckapp
+
+[handlers]
+keys=consoleHandler,detailedConsoleHandler
+
+[formatters]
+keys=normalFormatter,detailedFormatter
+
+[logger_root]
+level=INFO
+handlers=consoleHandler
+
+[logger_uicheckapp]
+level=DEBUG
+handlers=detailedConsoleHandler
+qualname=uicheckapp
+propagate=0
+
+[handler_consoleHandler]
+class=StreamHandler
+level=DEBUG
+formatter=normalFormatter
+args=(sys.stdout,)
+
+[handler_detailedConsoleHandler]
+class=StreamHandler
+level=DEBUG
+formatter=detailedFormatter
+args=(sys.stdout,)
+
+[formatter_normalFormatter]
+format=%(asctime)s loglevel=%(levelname)-6s logger=%(name)s %(funcName)s() L%(lineno)-4d %(message)s
+
+[formatter_detailedFormatter]
+format=%(asctime)s loglevel=%(levelname)-6s logger=%(name)s %(funcName)s() L%(lineno)-4d %(message)s   call_trace=%(pathname)s L%(lineno)-4d
\ No newline at end of file
diff --git a/examples/Raven-RWKV/model.py b/examples/Raven-RWKV/model.py
new file mode 100644
index 0000000..6976559
--- /dev/null
+++ b/examples/Raven-RWKV/model.py
@@ -0,0 +1,138 @@
+import json
+import logging
+import re
+from dataclasses import dataclass
+
+import lib_raven
+import torch
+from simple_ai.api.grpc.chat.server import LanguageModel
+
+
+def format_chat_log(chat: list[dict[str, str]] = None) -> str:
+    """Render ``chat`` as a "Bob:"/"Alice:" transcript ending with an open "Alice:" turn."""
+    # Roles other than "user"/"assistant" are skipped; None is treated as an empty history.
+    speakers = {"user": "Bob", "assistant": "Alice"}
+    turns = []
+    for item in chat or ():
+        if item.get("role") in speakers:
+            # Collapse newline runs so a message cannot contain the blank-line turn separator.
+            content = re.sub("\n+", "\n", (item.get("content") or "").strip())
+            turns.append(f"{speakers[item['role']]}: {content}\n\n")
+    return "".join(turns) + "Alice:"
+
+
+@dataclass(unsafe_hash=True)
+class RavenRWKVModel(LanguageModel):
+    """SimpleAI language model that serves chat, completion and embedding requests with RWKV.
+
+    ``model`` and ``pipeline`` are class attributes: they are loaded once, when this class
+    body is executed, and are shared by every instance.
+    """
+
+    gpu_id: int = 0
+    # Evaluated once at class-definition time, i.e. always with the default gpu_id of 0.
+    device = torch.device("cuda", gpu_id) if torch.cuda.is_available() else torch.device("cpu")
+    model, pipeline = lib_raven.get_model()
+
+    def chat(
+        self,
+        chatlog: list[dict[str, str]] = None,
+        max_tokens: int = 512,
+        temperature: float = 0.9,
+        top_p: float = 0.5,
+        presencePenalty: float = 0.4,
+        countPenalty: float = 0.4,
+        *args,
+        **kwargs,
+    ) -> list:
+        """Return the complete reply to ``chatlog`` as ``[{"role": ..., "content": ...}]``.
+
+        The reply is assembled from ``stream`` so that both chat endpoints use the same stop
+        words; without them generation could run on past the end of the reply.
+        """
+        deltas = self.stream(chatlog, max_tokens, temperature, top_p, presencePenalty, countPenalty)
+        content = "".join(delta[0].get("content", "") for delta in deltas)
+        return [{"role": "raven", "content": content}]
+
+    def complete(self, *args, **kwargs) -> str:
+        """Return the whole completion as one string.
+
+        All arguments are forwarded unchanged to ``stream_complete``.
+        """
+        return "".join(self.stream_complete(*args, **kwargs))
+
+    def stream_complete(
+        self,
+        prompt: str = None,
+        max_tokens: int = 512,
+        temperature: float = 0.9,
+        top_p: float = 0.5,
+        presencePenalty: float = 0.4,
+        countPenalty: float = 0.4,
+        stop=None,
+        **kwargs,
+    ):
+        """Yield the completion of ``prompt`` piece by piece.
+
+        ``stop`` may be a JSON-encoded list of stop strings, an already decoded list, or
+        ``None``/empty for no stop words.
+        """
+        # json.loads(None) raises TypeError, so only decode when a non-empty string was given.
+        if isinstance(stop, str) and stop:
+            stop = json.loads(stop)
+        yield from lib_raven.complete(
+            prompt,
+            self.model,
+            self.pipeline,
+            token_count=max_tokens,
+            temperature=temperature,
+            top_p=top_p,
+            presencePenalty=presencePenalty,
+            countPenalty=countPenalty,
+            stop_words=stop or None,
+        )
+
+    def stream(
+        self,
+        chatlog: list[dict[str, str]] = None,
+        max_tokens: int = 512,
+        temperature: float = 0.9,
+        top_p: float = 0.5,
+        presencePenalty: float = 0.4,
+        countPenalty: float = 0.4,
+        *args,
+        **kwargs,
+    ):
+        """Yield chat deltas: first ``[{"role": "raven"}]``, then one ``[{"content": ...}]`` per piece."""
+        # NOTE(review): format_chat_log() only keeps "user"/"assistant" turns, so a history
+        # that echoes this "raven" role back would lose the model's earlier replies.
+        yield [{"role": "raven"}]
+
+        deltas = lib_raven.complete(
+            format_chat_log(chatlog),
+            self.model,
+            self.pipeline,
+            prompt=None,
+            token_count=max_tokens,
+            temperature=temperature,
+            top_p=top_p,
+            presencePenalty=presencePenalty,
+            countPenalty=countPenalty,
+            stop_words=["\n\nBob:", "\n\nAlice:"],  # stop before the next speaker's turn
+        )
+        for index, delta in enumerate(deltas):
+            # Drop the single leading whitespace character of the completion, but never a
+            # real first character (an unconditional delta[1:] would eat one).
+            if index == 0 and delta[:1].isspace():
+                delta = delta[1:]
+            yield [{"content": delta}]
+
+    def embed(self, inputs: list = None) -> list:
+        """Return the embeddings of ``inputs`` as nested lists; ``None`` means no inputs."""
+        inputs = [] if inputs is None else inputs
+        logging.info(f"Processing inputs : {inputs}")
+        embeddings = lib_raven.embedding(inputs, self.model, self.pipeline)
+        logging.info(
+            f"Successfully computed embeddings (shape : {embeddings.shape}) for inputs : {inputs}"
+        )
+        return embeddings.tolist()
diff --git a/examples/Raven-RWKV/requirements.txt b/examples/Raven-RWKV/requirements.txt
new file mode 100644
index 0000000..2d40d61
--- /dev/null
+++ b/examples/Raven-RWKV/requirements.txt
@@ -0,0 +1,11 @@
+inquirer==3.1.3
+loralib==0.1.1
+ninja==1.11.1
+pynvml==11.5.0
+scipy==1.10.1
+sentencepiece==0.1.97
+simple-ai-server==0.2.0
+tokenizers==0.13.2
+torch==2.0.0
+transformers==4.27.3
+git+https://github.com/Nintorac/ChatRWKV.git@parallel#subdirectory=rwkv_pip_package 
diff --git a/examples/Raven-RWKV/server.py b/examples/Raven-RWKV/server.py
new file mode 100644
index 0000000..d663326
--- /dev/null
+++ b/examples/Raven-RWKV/server.py
@@ -0,0 +1,55 @@
+import logging
+from concurrent import futures
+
+import grpc
+from model import RavenRWKVModel as Model
+from simple_ai.api.grpc.chat.server import (
+    LanguageModelServicer as ChatServicer,
+    llm_chat_pb2_grpc,
+)
+from simple_ai.api.grpc.completion.server import (
+    LanguageModelServicer as CompletionServicer,
+    llm_pb2_grpc,
+)
+from simple_ai.api.grpc.embedding.server import (
+    LanguageModelServicer as EmbeddingServicer,
+    llm_embed_pb2_grpc,
+)
+
+
+def serve(
+    address="[::]:50051",
+    chat_servicer=None,  # NOTE(review): a None servicer is registered as-is; pass all three
+    embedding_servicer=None,
+    completion_servicer=None,
+    max_workers=10,  # threads in the pool handed to grpc.server
+):
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=max_workers))
+    llm_chat_pb2_grpc.add_LanguageModelServicer_to_server(chat_servicer, server)
+    llm_embed_pb2_grpc.add_LanguageModelServicer_to_server(embedding_servicer, server)
+    llm_pb2_grpc.add_LanguageModelServicer_to_server(completion_servicer, server)
+    server.add_insecure_port(address=address)  # insecure port: no credentials are configured
+    server.start()
+    server.wait_for_termination()  # keeps the caller here until the server terminates
+
+
+if __name__ == "__main__":
+    # Script entry point: parse --address, build the three servicers and serve.
+    import argparse
+
+    logging.basicConfig(level=logging.INFO)
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-a", "--address", type=str, default="[::]:50051")
+    args = parser.parse_args()
+
+    logging.info(f"Starting gRPC server on {args.address}")
+    # A single Model instance is shared by all three servicers (the weights are class
+    # attributes of the model class, so extra instances add nothing).
+    model = Model()
+    serve(
+        address=args.address,
+        chat_servicer=ChatServicer(model=model),
+        embedding_servicer=EmbeddingServicer(model=model),
+        completion_servicer=CompletionServicer(model=model),
+    )