diff --git a/.gitignore b/.gitignore
index 39b8837..c8ab742 100644
--- a/.gitignore
+++ b/.gitignore
@@ -215,3 +215,4 @@ logs/
# Resources not needed to checkin
Resources/*.json
+Resources/*.faiss
diff --git a/README.md b/README.md
index 92eea6f..853a178 100644
--- a/README.md
+++ b/README.md
@@ -39,15 +39,17 @@ RAG or Retrieval Augmented Generation is a technique used to retrieve external k
RAG is good because:
- Reduces hallucinations
-
- Enables citations
-
- Keeps answers faithful to source material
#### Chunking
[Chunker Module](atlas/core/chunker/README.md)
+#### Embedding and Indexing
+
+[Embedding Module](atlas/core/embedder/README.md)
+
#### Obsidian
[Obsidian](https://obsidian.md/) is a light weight application used to take notes and create knowledge bases. It saves all the notes as markdown making it easy to load, process and render a huge amount of notes.
@@ -76,7 +78,7 @@ So it follows the scaling law that even a small LLM when trained on enough quali
## Architecture
-Initial high level [architecture diagram](https://github.com/DivyenduDutta/Atlas/tree/master/Resources/Atlas_Architecture.png)
+High level [architecture diagram](Resources/Atlas_Architecture.png)
A sample of the `obsidian_index.json` is as below:
@@ -120,10 +122,56 @@ Before committing changes run `pre-commit run --all-files` or `pre-commit run --
Run `python .\atlas\core\ingest\obsidian_vault_processor.py`
-This will generate the `obsidian_index.json` in `/Resources` folder. This json file contains the processed data after ingesting and processing the notes from the obsidian vault.
+In the above script, modify
+- `obsidian_vault_path` to point to your obsidian vault's root folder ie, the folder containing `.obsidian` folder
+- `obsidian_index_path` to specify where the `obsidian_index.json` will be saved. This json file contains the processed data after ingesting and processing the notes from the obsidian vault. See [architecture](#architecture) section for the structure of this json.
+
+### Structural Chunker Module
+
+Run `python .\atlas\core\chunker\structural_chunker.py`
+
+In the above script, modify
+- `processed_data_path` to specify where the `obsidian_index.json` is present
+- `output_path` to specify where the `chunked_data.json` will be saved. This json file contains the chunks generated from the notes processed by the "Obsidian Vault Processor" module. See [`README` in `atlas/core/chunker`](atlas/core/chunker/README.md) for structure of this json.
+- `max_words` to set the maximum number of words per chunk. Choose it primarily based on the token limit of the encoding model and the context size of the LLM used in the later modules.
+
+### Embedder Module
+
+Run `python .\atlas\core\embedder\sentence_transformer\impl_embedder.py`
+
+In the above script modify,
+- `chunk_data_path` to specify where the `chunked_data.json` is present
+- `output_path` to specify where `embedded_chunks.json` will be saved. This json is identical to
+`chunked_data.json` except for the added `embedding` field on each chunk. See [`README` in `atlas/core/embedder`](atlas/core/embedder/README.md) for structure of this json.
+- `encoder_config_path` to specify your own configuration settings for the encoder model used to generate the chunk embeddings. The default is [`atlas/core/configs/sentence_transformer_config.yaml`](atlas/core/configs/sentence_transformer_config.yaml); edit it to change the encoder model used and its configuration. The following can be changed:
-See architecture section for structure of this json.
+```yaml
+model_name: sentence-transformers/all-MiniLM-L6-v2
+batch_size: 32
+normalize_embeddings: true
+device: cuda
+```
+
+### Indexer Module
+
+Run `python .\atlas\core\indexer\run_indexer.py`
+
+In the above script modify,
+- `results_save_path` to specify where the index and metadata file will be saved
+- `embedded_chunks_json_file` to specify where the `embedded_chunks.json` is present
### Tests
-Run unit tests via VS Code or `python -m unittest` to run all unit tests
+Run unit tests via VS Code
+
+or
+
+Run only unit tests - `pytest -m unittest`
+
+Run only integration tests - `pytest -m integration`
+
+Run only tests that can be run on CI - `pytest -m runonci`
+
+Run ALL tests - `pytest`
+
+Note: whenever a new pytest marker is added to a test, ensure it is registered in `pytest.ini`, otherwise pytest will complain
diff --git a/Resources/Atlas_Architecture.drawio b/Resources/Atlas_Architecture.drawio
index 103bab7..be9cd0e 100644
--- a/Resources/Atlas_Architecture.drawio
+++ b/Resources/Atlas_Architecture.drawio
@@ -80,8 +80,16 @@
-
-
+
+
+
+
+
+
+
+
+
+
@@ -95,7 +103,7 @@
-
+
diff --git a/Resources/Atlas_Architecture.png b/Resources/Atlas_Architecture.png
index c5f0477..4777596 100644
Binary files a/Resources/Atlas_Architecture.png and b/Resources/Atlas_Architecture.png differ
diff --git a/atlas/core/chunker/base_chunker.py b/atlas/core/chunker/base_chunker.py
index 65019ba..b5b8b67 100644
--- a/atlas/core/chunker/base_chunker.py
+++ b/atlas/core/chunker/base_chunker.py
@@ -1,6 +1,8 @@
from abc import ABC
from abc import abstractmethod
from typing import List, Dict
+from pathlib import Path
+import json
from atlas.utils.logger import LoggerConfig
@@ -10,15 +12,35 @@
class BaseChunker(ABC):
"""
Abstract base class for chunkers that split processed data into smaller "retrieval units.
+
+ Args:
+ processed_data_path (str): Path to the processed data file.
+ output_path (str): Path to save the chunked data.
"""
- @abstractmethod
+    def __init__(self, processed_data_path: str, output_path: str) -> None:
+        LOGGER.info("-" * 20)
+        LOGGER.info(f"{type(self).__name__} initialized.")  # concrete subclass name
+        LOGGER.info(f"Chunking processed data at {processed_data_path}")
+        self.processed_data_path = Path(processed_data_path)
+        self.output_path = Path(output_path)
+
def read_processed_data(self) -> List[Dict] | None:
"""
Read the processed data which is the output of the previous module
- ie,`KnowledgeBaseProcessor`.
+ ie, `KnowledgeBaseProcessor`.
+
+ Returns:
+ List[Dict] | None: The processed data as a list of dictionaries or None if an error occurs.
"""
- pass
+ try:
+ with open(self.processed_data_path, "r", encoding="utf-8") as file:
+ data = json.load(file)
+ LOGGER.info("Processed data successfully read.")
+ return data
+ except Exception as e:
+ LOGGER.error(f"Error reading processed data: {e}")
+ return None
@abstractmethod
def create_chunks(self, processed_data: List[Dict]) -> List[Dict]:
@@ -33,15 +55,23 @@ def create_chunks(self, processed_data: List[Dict]) -> List[Dict]:
"""
pass
- @abstractmethod
def save_chunked_data(self, chunked_data: List[Dict]) -> None:
"""
- Save the chunked data to a format suitable for later use.
+ Save the chunked data to the output path in JSON format.
+ This method writes to a temporary file first and then renames it to ensure atomicity.
+ This prevents data corruption in case of interruptions during the write process.
Args:
- chunked_data (list[dict]): The list of chunked data.
+ chunked_data (List[Dict]): The chunked data to be saved.
"""
- pass
+ self.output_path.parent.mkdir(parents=True, exist_ok=True)
+ tmp_path = self.output_path.with_suffix(".tmp")
+
+ with tmp_path.open("w", encoding="utf-8") as f:
+ json.dump(chunked_data, f, indent=2, ensure_ascii=False)
+
+ tmp_path.replace(self.output_path)
+ LOGGER.info(f"Chunks saved successfully to {str(self.output_path)}")
def chunk(self) -> None:
"""
diff --git a/atlas/core/chunker/structural_chunker.py b/atlas/core/chunker/structural_chunker.py
index b1a8edb..ddeb801 100644
--- a/atlas/core/chunker/structural_chunker.py
+++ b/atlas/core/chunker/structural_chunker.py
@@ -24,31 +24,9 @@ class StructuralChunker(BaseChunker):
def __init__(
self, processed_data_path: str, output_path: str, max_words: int
) -> None:
- LOGGER.info("-" * 20)
- LOGGER.info("StructuralChunker initialized.")
- LOGGER.info(f"Chunking processed data at {processed_data_path}")
- self.processed_data_path = Path(processed_data_path)
- self.output_path = Path(output_path)
+ super().__init__(processed_data_path, output_path)
self.max_words = max_words
- def read_processed_data(self) -> List[Dict] | None:
- """
- Read the obsidian indexed data which is the output of the previous module
- ie, `ObsidianVaultProcessor`.
-
- Returns:
- List[Dict] | None: The obsidian indexed data as a list of dictionaries or None if an error occurs.
- """
-
- try:
- with open(self.processed_data_path, "r", encoding="utf-8") as file:
- data = json.load(file)
- LOGGER.info("Obsidian indexed data successfully read.")
- return data
- except Exception as e:
- LOGGER.error(f"Error reading processed data: {e}")
- return None
-
def _split_by_word_limit(self, text: str, max_words: int) -> list[str]:
"""
Split text into chunks based on a maximum word limit.
@@ -213,22 +191,6 @@ def create_chunks(self, processed_data: List[Dict]) -> List[Dict]:
return chunks
- def save_chunked_data(self, chunked_data: List[Dict]) -> None:
- """
- Save the chunked data to the output path in JSON format.
- This method writes to a temporary file first and then renames it to ensure atomicity.
- This prevents data corruption in case of interruptions during the write process.
-
- Args:
- chunked_data (List[Dict]): The chunked data to be saved.
- """
- tmp_path = self.output_path.with_suffix(".tmp")
-
- with tmp_path.open("w", encoding="utf-8") as f:
- json.dump(chunked_data, f, indent=2, ensure_ascii=False)
-
- tmp_path.replace(self.output_path)
-
if __name__ == "__main__":
processed_data_path = r"D:\\Deep learning\\Atlas\\Resources\\obsidian_index.json"
diff --git a/atlas/core/configs/sentence_transformer_config.yaml b/atlas/core/configs/sentence_transformer_config.yaml
new file mode 100644
index 0000000..8840c7d
--- /dev/null
+++ b/atlas/core/configs/sentence_transformer_config.yaml
@@ -0,0 +1,4 @@
+model_name: sentence-transformers/all-MiniLM-L6-v2
+batch_size: 32
+normalize_embeddings: true
+device: cuda
diff --git a/atlas/core/embedder/README.md b/atlas/core/embedder/README.md
new file mode 100644
index 0000000..7aba23a
--- /dev/null
+++ b/atlas/core/embedder/README.md
@@ -0,0 +1,61 @@
+## Embedder Module
+
+LLMs don't really understand raw text. Hence, the text needs to be converted to a numeric representation, more specifically a vector called an embedding. This is just a numeric representation in a low dimensional space. Two vectors close to each other in this space represent two texts which are close to each other semantically.
+
+### Encoder Model Choice
+
+`sentence-transformers/all-MiniLM-L6-v2` from [Sentence Transformers](https://www.sbert.net/) was chosen because it:
+- is fast and lightweight (super important for latency)
+- provides really good [semantic search](https://www.sbert.net/examples/sentence_transformer/applications/semantic-search/README.html#background) performance
+
+
+The encoder model is ultimately used for semantic search.
+
+#### What is Semantic Search?
+
+1. Take chunks → embed into vector space
+2. Take query → embed into same space
+3. Find nearest neighbors (cosine / dot / L2)
+4. Return top-k chunks
+
+#### Why not use TinyLlama's encoder?
+
+- There are three types of Transformer models
+ - Encoder only models
+ - e.g., BERT, RoBERTa, MiniLM
+ - Decoder only models
+ - e.g., Llama/TinyLlama/GPT-2
+ - they don't have an explicit encoder model in their architecture, but they do encode text internally
+ - Encoder - Decoder models
+ - BART, T5, FLAN
+
+- TinyLlama, being a decoder only model, is specifically trained for next token prediction (the encoding is still done, but it's not the main focus and it does not have an encoder in the architectural sense).
+- Whereas encoder only models are specifically trained to generate embeddings and to support further use cases of embeddings (like retrieval and semantic search)
+
+#### Structure of embedding chunks json
+
+```json
+[
+ {
+ "chunk_id": "folder/sample note.md::Heading 1::0",
+ "note_id": "folder/sample note.md",
+ "title": "sample note",
+ "relative_path": "folder/sample note.md",
+ "heading": "Heading 1",
+ "chunk_index": 0,
+ "text": "lorem ipsum",
+ "word_count": 2,
+ "tags": [],
+ "frontmatter": {},
+ "embedding": [
+ 0.017203988507390022,
+ 0.06233978644013405,
+ -0.011157829314470291,
+ -0.012113398872315884,
+ ...
+ ]
+ },
+ ...
+]
+```
+- This is the same as the json output of the chunker module with the added `embedding` key. This key holds the vector representation of the `text` as provided by the chosen encoder model.
diff --git a/atlas/core/ingest/__init__.py b/atlas/core/embedder/__init__.py
similarity index 100%
rename from atlas/core/ingest/__init__.py
rename to atlas/core/embedder/__init__.py
diff --git a/atlas/core/embedder/base/__init__.py b/atlas/core/embedder/base/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/atlas/core/embedder/base/base_embedder.py b/atlas/core/embedder/base/base_embedder.py
new file mode 100644
index 0000000..ae9b21b
--- /dev/null
+++ b/atlas/core/embedder/base/base_embedder.py
@@ -0,0 +1,95 @@
+from abc import ABC
+from abc import abstractmethod
+from typing import List, Dict
+from pathlib import Path
+import json
+
+from atlas.utils.logger import LoggerConfig
+
+LOGGER = LoggerConfig().logger
+
+
+class BaseEmbedder(ABC):
+    """
+    Abstract base class for all embedder implementations.
+
+    Args:
+        chunk_data_path (str): Path to the chunk data file.
+        output_path (str): Path to save the embedded chunks.
+        encoder_config_path (str): Path to the encoder configuration file.
+    """
+
+    def __init__(
+        self, chunk_data_path: str, output_path: str, encoder_config_path: str
+    ) -> None:
+        LOGGER.info("-" * 20)
+        LOGGER.info("Initializing Embedder.")
+        self.chunk_data_path = Path(chunk_data_path)
+        self.output_path = Path(output_path)
+        self.encoder_config_path = Path(encoder_config_path)
+        self.load_encoder()  # subclass hook; runs after the paths above are set
+
+    def read_chunk_data(self) -> List[Dict] | None:
+        """
+        Load the chunk data to be embedded from the JSON file at `chunk_data_path`.
+
+        Returns:
+            List[Dict] | None: The parsed chunk data, or None if reading fails.
+        """
+
+        LOGGER.info(f"Loading chunk data from {self.chunk_data_path}")
+        try:
+            with self.chunk_data_path.open("r", encoding="utf-8") as f:
+                chunk_data = json.load(f)
+            LOGGER.info(f"Loaded {len(chunk_data)} chunks for embedding.")
+            return chunk_data
+        except Exception as e:
+            LOGGER.error(f"Error reading chunk data file: {e}")
+            return None
+
+    @abstractmethod
+    def load_encoder(self) -> None:
+        """
+        Load the encoder model. Called from `__init__`.
+        """
+        pass
+
+    def embed(self) -> None:
+        """
+        Run the embedding process: read the chunks, embed them and save the result.
+        """
+        chunks = self.read_chunk_data()
+        assert chunks is not None, "Chunk data read should be present."
+        embedded_chunks = self.embed_chunks(chunks)
+        self.save_embedded_chunks(embedded_chunks)
+        LOGGER.info("Embedding process completed.")
+
+    @abstractmethod
+    def embed_chunks(self, chunks: List[Dict]) -> List[Dict]:
+        """
+        Embed the provided chunks using the loaded encoder.
+
+        Args:
+            chunks (List[Dict]): List of chunk dictionaries to be embedded.
+
+        Returns:
+            List[Dict]: List of chunk dictionaries with added embeddings.
+        """
+        pass
+
+    def save_embedded_chunks(self, embedded_chunks: List[Dict]) -> None:
+        """
+        Save the embedded chunks as JSON at `output_path`.
+
+        Args:
+            embedded_chunks (List[Dict]): List of chunk dictionaries with added embeddings.
+        """
+
+        self.output_path.parent.mkdir(parents=True, exist_ok=True)
+        tmp_path = self.output_path.with_suffix(".tmp")  # written first, renamed below
+
+        with tmp_path.open("w", encoding="utf-8") as f:
+            json.dump(embedded_chunks, f, indent=2, ensure_ascii=False)
+
+        tmp_path.replace(self.output_path)
+        LOGGER.info(f"Embedded chunks saved successfully to {str(self.output_path)}")
diff --git a/atlas/core/embedder/base/base_encoder.py b/atlas/core/embedder/base/base_encoder.py
new file mode 100644
index 0000000..16a66c1
--- /dev/null
+++ b/atlas/core/embedder/base/base_encoder.py
@@ -0,0 +1,33 @@
+from abc import ABC
+from abc import abstractmethod
+from typing import List
+import numpy as np
+
+from atlas.utils.logger import LoggerConfig
+
+LOGGER = LoggerConfig().logger
+
+
+class BaseEncoder(ABC):
+    """Abstract base class for the encoder wrapper."""
+
+    @abstractmethod
+    def load(self) -> None:
+        """
+        Load the encoder model.
+        Concrete subclasses decide which model is loaded and how.
+        """
+        pass
+
+    @abstractmethod
+    def encode(self, texts: List[str]) -> np.ndarray:
+        """
+        Encode a list of texts into their corresponding embeddings.
+
+        Args:
+            texts (List[str]): A list of texts to be encoded.
+
+        Returns:
+            np.ndarray: An array of embeddings corresponding to the input texts.
+        """
+        pass
diff --git a/atlas/core/embedder/config.py b/atlas/core/embedder/config.py
new file mode 100644
index 0000000..c3720d9
--- /dev/null
+++ b/atlas/core/embedder/config.py
@@ -0,0 +1,39 @@
+from dataclasses import dataclass
+import yaml
+from pathlib import Path
+
+from atlas.utils.logger import LoggerConfig
+
+LOGGER = LoggerConfig().logger
+
+
+@dataclass
+class EncoderConfig:
+    """Configuration parameters for the encoder, one field per YAML key."""
+
+    model_name: str  # model identifier, e.g. sentence-transformers/all-MiniLM-L6-v2
+    batch_size: int  # batch size passed to the model's encode call
+    normalize_embeddings: bool  # forwarded as `normalize_embeddings` when encoding
+    device: str  # device the model should be loaded on, e.g. cuda
+
+
+def load_encoder_config(path: Path) -> EncoderConfig:
+    """
+    Load encoder configuration from a YAML file.
+
+    Args:
+        path (Path): Path to the encoder YAML configuration file.
+    Returns: EncoderConfig built from the top-level keys of the YAML file.
+    Raises: FileNotFoundError if `path` is missing, ValueError if the file is empty.
+    """
+    try:
+        with path.open("r", encoding="utf-8") as f:
+            data = yaml.safe_load(f)
+    except FileNotFoundError as e:
+        LOGGER.error(f"Encoder configuration file not found: {path}")
+        # Chain the original error so the OS-level details stay in the traceback.
+        raise FileNotFoundError(f"Encoder configuration file not found: {path}") from e
+    if not data:
+        LOGGER.error(f"Encoder configuration file is empty: {path}")
+        raise ValueError(f"Encoder configuration file is empty: {path}")
+    LOGGER.info(f"Encoder configuration loaded successfully from {path}")
+    return EncoderConfig(**data)
diff --git a/atlas/core/embedder/sentence_transformer/__init__.py b/atlas/core/embedder/sentence_transformer/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/atlas/core/embedder/sentence_transformer/impl_embedder.py b/atlas/core/embedder/sentence_transformer/impl_embedder.py
new file mode 100644
index 0000000..008649e
--- /dev/null
+++ b/atlas/core/embedder/sentence_transformer/impl_embedder.py
@@ -0,0 +1,72 @@
+import os
+from typing import List, Dict
+
+from atlas.core.embedder.base.base_embedder import BaseEmbedder
+from atlas.core.embedder.config import load_encoder_config
+from atlas.core.embedder.sentence_transformer.impl_encoder import (
+ SentenceTransformerEncoder,
+)
+from atlas.utils.logger import LoggerConfig
+
+LOGGER = LoggerConfig().logger
+
+
+class SentenceTransformerEmbedder(BaseEmbedder):
+    """Embedder that generates chunk embeddings with a `SentenceTransformerEncoder`."""
+
+    def __init__(
+        self, chunk_data_path: str, output_path: str, encoder_config_path: str
+    ) -> None:
+        super().__init__(chunk_data_path, output_path, encoder_config_path)
+
+    def load_encoder(self) -> None:
+        """Build the Sentence Transformer encoder from the config at `encoder_config_path`."""
+
+        embedding_config = load_encoder_config(self.encoder_config_path)
+        encoder = SentenceTransformerEncoder(embedding_config)
+        self.encoder = encoder
+
+    def embed_chunks(self, chunks: List[Dict]) -> List[Dict]:
+        """
+        Embed the `text` of every provided chunk using the loaded encoder.
+
+        Args:
+            chunks (List[Dict]): List of chunk dictionaries to be embedded.
+
+        Returns:
+            List[Dict]: New chunk dictionaries with an added `embedding` key ([] if no chunks).
+        """
+        if not chunks:
+            LOGGER.warning("No chunks provided for embedding.")
+            return []
+
+        texts = [chunk["text"] for chunk in chunks]
+
+        embeddings = self.encoder.encode(texts)
+
+        if len(embeddings) != len(chunks):
+            LOGGER.error("Embedding count does not match chunk count.")
+            raise ValueError("Embedding count does not match chunk count")
+
+        embedded_chunks = []
+
+        for chunk, embedding in zip(chunks, embeddings):
+            embedded_chunk = {
+                **chunk,
+                "embedding": embedding.tolist(),  # Convert numpy array to list for JSON serialization
+            }
+            embedded_chunks.append(embedded_chunk)
+
+        return embedded_chunks
+
+
+if __name__ == "__main__":  # manual run; adjust the hard-coded paths below first
+    chunk_data_path = r"D:\\Deep learning\\Atlas\\Resources\\chunked_data.json"
+    output_path = r"D:\\Deep learning\\Atlas\\Resources\\embedded_chunks.json"
+    encoder_config_path = os.path.join(
+        os.getcwd(), "atlas", "core", "configs", "sentence_transformer_config.yaml"
+    )
+    embedder = SentenceTransformerEmbedder(
+        chunk_data_path, output_path, encoder_config_path
+    )
+    embedder.embed()
diff --git a/atlas/core/embedder/sentence_transformer/impl_encoder.py b/atlas/core/embedder/sentence_transformer/impl_encoder.py
new file mode 100644
index 0000000..699e442
--- /dev/null
+++ b/atlas/core/embedder/sentence_transformer/impl_encoder.py
@@ -0,0 +1,61 @@
+from sentence_transformers import SentenceTransformer
+import numpy as np
+import torch
+
+from atlas.core.embedder.base.base_encoder import BaseEncoder
+from atlas.core.embedder.config import EncoderConfig
+from atlas.utils.logger import LoggerConfig
+
+from typing import List
+
+LOGGER = LoggerConfig().logger
+
+
+class SentenceTransformerEncoder(BaseEncoder):
+    """
+    Sentence Transformer Encoder Wrapper. The model is loaded on construction.
+
+    Args:
+        config (EncoderConfig): Configuration for the encoder.
+    """
+
+    def __init__(self, config: EncoderConfig):
+        LOGGER.info("-" * 20)
+        LOGGER.info("Initializing Sentence Transformer Encoder Wrapper")
+        self.config = config
+        self.model: SentenceTransformer | None = None
+        self.load()
+
+    def load(self) -> None:
+        """Load the model once, falling back to CPU when CUDA is unavailable."""
+        if self.model is not None:
+            return
+
+        device = self.config.device  # honor non-"cuda" settings such as "cpu"
+        if device == "cuda" and not torch.cuda.is_available():
+            device = "cpu"
+        self.model = SentenceTransformer(self.config.model_name, device=device)
+        LOGGER.info(f"Loaded Sentence Transformer model: {self.config.model_name}")
+
+    def encode(self, texts: List[str]) -> np.ndarray:
+        """
+        Encode a list of texts into embeddings using the configured batch size.
+
+        Args:
+            texts (List[str]): List of texts to encode.
+
+        Returns:
+            np.ndarray: Array of embeddings, one per input text.
+        """
+        LOGGER.info(f"Encoding {len(texts)} texts using Sentence Transformer model.")
+
+        assert self.model is not None, "Model must be loaded before encoding"
+
+        embeddings = self.model.encode(
+            texts,
+            batch_size=self.config.batch_size,
+            show_progress_bar=True,
+            normalize_embeddings=self.config.normalize_embeddings,
+        )
+
+        return embeddings
diff --git a/atlas/core/indexer/README.md b/atlas/core/indexer/README.md
new file mode 100644
index 0000000..c7c50a9
--- /dev/null
+++ b/atlas/core/indexer/README.md
@@ -0,0 +1,29 @@
+## Indexer Module
+
+Indexing as a general concept is used to enhance the speed of search and retrieval (i.e., lookup) at the expense of storage space. An index is a data structure that allows us to do this.
+
+> We could scan all our data every time.
+> An index exists so we don’t have to.
+
+It's used in various places, like:
+- databases
+- search engines
+- vector stores
+- compilers
+
+### Why do indexing if I can just use the encoder to give me top K chunks?
+
+We can skip indexing when our data is small. We do indexing because computing similarity against everything doesn’t scale. Indexing exists to make nearest-neighbor search fast.
+
+When we use the encoder to get the top K chunks for a query, internally it ends up comparing against all the vector embeddings we have.
+
+```bash
+for each chunk:
+ similarity(query_embedding, chunk_embedding)
+```
+
+Time Complexity : `O(N × d)` where `N` = number of chunks and `d` = embedding dimension
+
+### Vector indexing library
+
+[FAISS](https://faiss.ai/index.html), a vector indexing library, was chosen instead of a vector DB to have more control when building the vector index.
diff --git a/atlas/core/indexer/__init__.py b/atlas/core/indexer/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/atlas/core/indexer/base_vector_store.py b/atlas/core/indexer/base_vector_store.py
new file mode 100644
index 0000000..26ef269
--- /dev/null
+++ b/atlas/core/indexer/base_vector_store.py
@@ -0,0 +1,66 @@
+from abc import ABC
+from abc import abstractmethod
+from typing import List, Dict
+import numpy as np
+
+
+class BaseVectorStore(ABC):
+    """
+    Abstract base class for a vector store. Used for vector indexing.
+    """
+
+    @abstractmethod
+    def add(self, vectors: np.ndarray, metadata: List[Dict]) -> None:
+        """
+        Add the vector embeddings to the vector index and the corresponding
+        chunks to the metadata.
+
+        Args:
+            vectors (np.ndarray): Vector embeddings to add to the vector store.
+            metadata (List[Dict]): Corresponding list of chunk dictionaries.
+        """
+        pass
+
+    @abstractmethod
+    def search(self, query_vector: np.ndarray, k: int) -> List[Dict]:
+        """
+        Search a query (via its embedding) in the vector store.
+
+        Args:
+            query_vector (np.ndarray): Embedding of the query to search in the vector store.
+            k (int): Number of most similar embeddings (aka neighbors) to the query vector.
+
+        Returns:
+            List[Dict]: List of dictionaries of the most similar embeddings to the query vector.
+        """
+        pass
+
+    @abstractmethod
+    def save(self, results_save_path: str) -> None:
+        """
+        Save the following two files:
+            1. index file
+            2. chunk metadata
+
+        Ensure that the elements in the two files are in sync
+        ie, `vector ID <-> metadata list index`
+
+        Args:
+            results_save_path (str): Directory to save the above mentioned two result files.
+        """
+        pass
+
+    @abstractmethod
+    def load(self, results_load_path: str) -> None:
+        """
+        Load the following two files:
+            1. index file
+            2. chunk metadata
+
+        Use this when the index and metadata are already saved and do not need to be
+        built from scratch.
+
+        Args:
+            results_load_path (str): Directory to load the above mentioned two result files from.
+        """
+        pass
diff --git a/atlas/core/indexer/faiss_vector_store.py b/atlas/core/indexer/faiss_vector_store.py
new file mode 100644
index 0000000..a14a530
--- /dev/null
+++ b/atlas/core/indexer/faiss_vector_store.py
@@ -0,0 +1,160 @@
+import faiss
+import numpy as np
+from typing import List, Dict
+from pathlib import Path
+import json
+
+from atlas.core.indexer.base_vector_store import BaseVectorStore
+from atlas.utils.logger import LoggerConfig
+
+LOGGER = LoggerConfig().logger
+
+
+class FaissVectorStore(BaseVectorStore):
+    """
+    Vector store using FAISS (Facebook AI Similarity Search) library.
+    Currently uses flat inner-product indexing (`IndexFlatIP`) but can be changed as needed.
+
+    Args:
+        dim (int): Number of dimensions of the embeddings/vectors.
+    """
+
+    def __init__(self, dim: int):
+        LOGGER.info("-" * 20)
+        LOGGER.info("Initializing Indexer.")
+        self.dim = dim
+        self.index = faiss.IndexFlatIP(dim)
+        self.metadata: List[Dict] = []  # metadata[i] belongs to FAISS vector ID i
+
+    def add(self, vectors: np.ndarray, metadata: List[Dict]) -> None:
+        """
+        Add the vector embeddings to the FAISS vector index and the corresponding
+        chunks to the metadata.
+
+        Args:
+            vectors (np.ndarray): Embeddings of shape (n, dim) to add to the store.
+            metadata (List[Dict]): Corresponding chunk dictionaries, in the same order.
+        Raises:
+            ValueError: If `vectors` is not (n, dim) or its length differs from `metadata`.
+        """
+        if vectors.ndim != 2 or vectors.shape[1] != self.dim:
+            LOGGER.error(
+                f"Invalid vector shape. Expected num of dim = 2 and size of vector = {self.dim}"
+            )
+            raise ValueError(
+                f"Invalid vector shape. Expected num of dim = 2 and size of vector = {self.dim}"
+            )
+
+        if len(vectors) != len(metadata):
+            LOGGER.error("Vectors and metadata length mismatch")
+            raise ValueError("Vectors and metadata length mismatch")
+
+        # Metadata is appended in vector order to enforce the invariant
+        # `FAISS vector ID <-> metadata list index`. FAISS stores float32 only, so coerce.
+        vectors = np.ascontiguousarray(vectors, dtype=np.float32)
+        self.index.add(vectors)
+        self.metadata.extend(metadata)
+
+    def search(self, query_vector: np.ndarray, k: int) -> List[Dict]:
+        """
+        Search a query (via its embedding/vector) in the FAISS vector store.
+
+        Args:
+            query_vector (np.ndarray): Embedding of the query, 1-D or of shape (1, dim).
+            k (int): Number of most similar embeddings (aka neighbors) to the query vector.
+
+        Returns:
+            List[Dict]: One dictionary per neighbor found, holding the full chunk metadata
+                plus a `score` key with the inner-product similarity to the query.
+        Raises:
+            Exception: If `k` is larger than the number of indexed vectors.
+        """
+        if k > self.index.ntotal:
+            LOGGER.error(f"k is more than maximum possible value : {self.index.ntotal}")
+            raise Exception(
+                f"k is more than maximum possible value : {self.index.ntotal}"
+            )
+
+        # FAISS expects a float32 matrix of queries, so coerce the dtype and turn a
+        # single 1-D vector into a batch of one.
+        query_vector = np.ascontiguousarray(query_vector, dtype=np.float32)
+        if query_vector.ndim == 1:
+            query_vector = query_vector.reshape(1, -1)
+
+        # search() returns two arrays:
+        #   scores: shape (n_queries, k)
+        #   indices: shape (n_queries, k)
+        scores, indices = self.index.search(query_vector, k)
+
+        results = []
+        for score, idx in zip(scores[0], indices[0]):  # row 0: the single query
+            if idx == -1:  # guard for neighbor not found for given query vector
+                continue
+
+            result = {"score": float(score), **self.metadata[idx]}
+            results.append(result)
+
+        LOGGER.info(f"Number of similar embeddings found : {len(results)}")
+        # No neighbors (e.g. k == 0) means there is no best match to log.
+        if results:
+            LOGGER.info(
+                f"Chunk with highest match : {results[0]['score']} is {results[0]['chunk_id']}"
+            )
+        return results
+
+    def save(self, results_save_path: str) -> None:
+        """
+        Save the following two files:
+            1. index file -> index.faiss
+            2. chunk metadata -> metadata.json
+
+        Ensure that the elements in the two files are in sync
+        ie, `FAISS vector ID <-> metadata list index`
+
+        Args:
+            results_save_path (str): Directory to save the above mentioned two result files.
+        """
+        _results_save_path = Path(results_save_path)
+        _results_save_path.mkdir(parents=True, exist_ok=True)
+
+        faiss.write_index(self.index, str(_results_save_path / "index.faiss"))
+
+        # Temp file + rename: an interrupted write never truncates metadata.json.
+        metadata_save_path = _results_save_path / "metadata.json"
+        tmp_path = metadata_save_path.with_suffix(".tmp")
+        with tmp_path.open("w", encoding="utf-8") as f:
+            json.dump(self.metadata, f, indent=2, ensure_ascii=False)
+
+        tmp_path.replace(metadata_save_path)
+        LOGGER.info(
+            f"Index file and chunk metadata saved successfully to directory : {results_save_path}"
+        )
+
+    def load(self, results_load_path: str) -> None:
+        """
+        Load the following two files:
+            1. index file -> index.faiss
+            2. chunk metadata -> metadata.json
+
+        Use this when the index and metadata are already saved and do not need to be
+        built from scratch.
+
+        Args:
+            results_load_path (str): Directory to load the above mentioned two result files from.
+        Raises:
+            Exception: If either file cannot be read (the original error is chained).
+        """
+        _results_load_path = Path(results_load_path)
+        try:
+            self.index = faiss.read_index(str(_results_load_path / "index.faiss"))
+            self.dim = self.index.d  # keep add()'s shape check aligned with the loaded index
+            metadata_path = _results_load_path / "metadata.json"
+            with metadata_path.open("r", encoding="utf-8") as f:
+                self.metadata = json.load(f)
+            LOGGER.info(
+                f"Index file and chunk metadata loaded successfully from directory : {results_load_path}"
+            )
+        except Exception as e:
+            message = f"Error reading either index file or metadata file : {e}"
+            LOGGER.error(message)
+            raise Exception(message) from e
diff --git a/atlas/core/indexer/run_indexer.py b/atlas/core/indexer/run_indexer.py
new file mode 100644
index 0000000..f76811a
--- /dev/null
+++ b/atlas/core/indexer/run_indexer.py
@@ -0,0 +1,47 @@
+import os
+import numpy as np
+
+from atlas.utils.embedder_utils import load_embedded_chunks, generate_embedding
+from atlas.core.indexer.faiss_vector_store import FaissVectorStore
+
+from atlas.utils.logger import LoggerConfig
+
+LOGGER = LoggerConfig().logger
+
+if __name__ == "__main__":
+ LOGGER.info("Running indexer to save the chunk embeddings to a vector index")
+ # this is the root folder in which the following 2 files are saved:
+ # 1. index file
+ # 2. metadata json
+ results_save_path = r"D:\\Deep learning\\Atlas\\Resources"
+ store = FaissVectorStore(
+ dim=384
+ ) # the encoder model we used generated embeddings of size 384
+ embedded_chunks_json_file = (
+ r"D:\\Deep learning\\Atlas\\Resources\\embedded_chunks.json"
+ )
+ embedded_chunks = load_embedded_chunks(embedded_chunks_json_file)
+
+ store.add(
+ vectors=np.array([chunk["embedding"] for chunk in embedded_chunks]),
+ metadata=embedded_chunks,
+ )
+
+ store.save(results_save_path)
+
+ # Sanity checks
+ # query_text = "Role of luck in life" # exact phrase query
+ # query_text = "Folks who inspire me" # paraphrasing
+ query_text = (
+ "Journey is more important that the final result in life" # paraphrasing
+ )
+ encoder_config_path = os.path.join(
+ os.getcwd(), "atlas", "core", "configs", "sentence_transformer_config.yaml"
+ )
+ query_vector = generate_embedding(query_text, encoder_config_path)
+ results = store.search(query_vector, k=5)
+ LOGGER.info(len(results))
+ for res in results:
+ LOGGER.info(f"score: {res['score']}")
+ LOGGER.info(f"Note title: {res['chunk_id']}")
+ LOGGER.info("===\n")
diff --git a/atlas/core/ingester/__init__.py b/atlas/core/ingester/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/atlas/core/ingest/base_file_processor.py b/atlas/core/ingester/base_file_processor.py
similarity index 61%
rename from atlas/core/ingest/base_file_processor.py
rename to atlas/core/ingester/base_file_processor.py
index b1aeaf8..da75748 100644
--- a/atlas/core/ingest/base_file_processor.py
+++ b/atlas/core/ingester/base_file_processor.py
@@ -2,6 +2,7 @@
from abc import abstractmethod
from typing import List, Dict
from pathlib import Path
+import json
from atlas.utils.logger import LoggerConfig
@@ -11,8 +12,16 @@
class KnowledgeBaseProcessor(ABC):
"""
Abstract base class for processors that handle knowledge base files.
+
+ Args:
+ vault_path (str): Path to the knowledge base.
+ output_path (str): Path to save the processed data.
"""
+ def __init__(self, vault_path: str, output_path: str) -> None:
+ self.vault_path = Path(vault_path)
+ self.output_path = Path(output_path)
+
@abstractmethod
def precheck(self) -> bool:
"""
@@ -32,15 +41,22 @@ def process(self) -> List[Dict]:
"""
pass
- @abstractmethod
def save_processed_data(self, processed_data: List[Dict]) -> None:
"""
- Save the processed data to a format suitable for later use.
+ Save the processed data to a JSON file atomically.
+ This ensures that the file is either fully written or not written at all.
Args:
- notes (list[dict]): The list of parsed metadata ie, processed data.
+ processed_data (list[dict]): The list of parsed notes metadata.
"""
- pass
+ self.output_path.parent.mkdir(parents=True, exist_ok=True)
+ tmp_path = self.output_path.with_suffix(".tmp")
+
+ with tmp_path.open("w", encoding="utf-8") as f:
+ json.dump(processed_data, f, indent=2, ensure_ascii=False)
+
+ tmp_path.replace(self.output_path)
+ LOGGER.info(f"Processed data successfully to {str(self.output_path)}")
def ingest(self) -> None:
"""
diff --git a/atlas/core/ingest/obsidian_vault_processor.py b/atlas/core/ingester/obsidian_vault_processor.py
similarity index 89%
rename from atlas/core/ingest/obsidian_vault_processor.py
rename to atlas/core/ingester/obsidian_vault_processor.py
index 973f0fb..b3fba2b 100644
--- a/atlas/core/ingest/obsidian_vault_processor.py
+++ b/atlas/core/ingester/obsidian_vault_processor.py
@@ -1,6 +1,6 @@
from datetime import date
from atlas.utils.logger import LoggerConfig
-from atlas.core.ingest.base_file_processor import KnowledgeBaseProcessor
+from atlas.core.ingester.base_file_processor import KnowledgeBaseProcessor
from pathlib import Path
from datetime import date, datetime
@@ -13,13 +13,7 @@
class ObsidianVaultProcessor(KnowledgeBaseProcessor):
- """
- Processor for Obsidian Vaults to extract notes metadata.
-
- Args:
- vault_path (str): Path to the Obsidian vault.
- output_path (str): Path to save the processed data (obsidian indexed data).
- """
+ """Processor for Obsidian Vaults to extract notes metadata."""
_OBSIDIAN_CONFIG_FILES = {
"app.json",
@@ -28,11 +22,10 @@ class ObsidianVaultProcessor(KnowledgeBaseProcessor):
}
def __init__(self, vault_path: str, output_path: str) -> None:
+ super().__init__(vault_path, output_path)
LOGGER.info("-" * 20)
LOGGER.info("ObsidianVaultProcessor initialized.")
LOGGER.info(f"Obsidian Vault to be processed: {vault_path}")
- self.vault_path = Path(vault_path)
- self.output_path = Path(output_path)
def _find_vault_root(self) -> Path | None:
"""
@@ -102,6 +95,12 @@ def precheck(self) -> bool:
return is_valid_obsidian_vault
def _normalize_yaml(self, obj):
+ """
+ Recursively normalize YAML data by converting date and datetime objects to ISO format strings.
+
+ Args:
+ obj: The YAML data to normalize.
+ """
if isinstance(obj, dict):
return {k: self._normalize_yaml(v) for k, v in obj.items()}
elif isinstance(obj, list):
@@ -217,21 +216,6 @@ def _parse_markdown_note(self, note_path: Path, vault_path: Path) -> Dict[str, A
"word_count": len(body.split()),
}
- def save_processed_data(self, processed_data: List[Dict]) -> None:
- """
- Save the processed data to a JSON file atomically.
- This ensures that the file is either fully written or not written at all.
-
- Args:
- processed_data (list[dict]): The list of parsed notes metadata.
- """
- tmp_path = self.output_path.with_suffix(".tmp")
-
- with tmp_path.open("w", encoding="utf-8") as f:
- json.dump(processed_data, f, indent=2, ensure_ascii=False)
-
- tmp_path.replace(self.output_path)
-
def process(self) -> list[dict]:
"""
Process the Obsidian vault to extract notes metadata.
diff --git a/atlas/utils/embedder_utils.py b/atlas/utils/embedder_utils.py
new file mode 100644
index 0000000..7428f9d
--- /dev/null
+++ b/atlas/utils/embedder_utils.py
@@ -0,0 +1,56 @@
+import json
+from pathlib import Path
+from typing import List, Dict
+import numpy as np
+
+from atlas.core.embedder.config import load_encoder_config
+from atlas.core.embedder.sentence_transformer.impl_encoder import (
+ SentenceTransformerEncoder,
+)
+
+from atlas.utils.logger import LoggerConfig
+
+LOGGER = LoggerConfig().logger
+
+
+def load_embedded_chunks(path: str) -> List[Dict]:
+ """
+ Load and return the list of chunk dictionaries with added embeddings.
+
+ Args:
+ path (str): Path to the json file containing the list of chunk dictionaries.
+
+ Returns:
+ List[Dict]: The list of chunk dictionaries with added embeddings.
+ """
+
+ _path = Path(path)
+ try:
+ with (_path).open("r", encoding="utf-8") as f:
+ metadata = json.load(f)
+ except Exception as e:
+ LOGGER.error(f"Error loading embedded chunks json file : {e}")
+ raise Exception(f"Error loading embedded chunks json file : {e}")
+
+ LOGGER.info(f"Embedded chunks json successfully loaded from {str(path)}")
+ return metadata
+
+
+def generate_embedding(text: str, encoder_config_path: str) -> np.ndarray:
+ """
+ Generate the embedding/vector for a given text using the configuration settings
+ for a specific encoder.
+
+ Args:
+ text (str): `text` for which to generate embeddings.
+ encoder_config_path (str): Path to the configuration settings file for the encoder.
+
+ Returns:
+ np.ndarray: Embedding/vector for the provided `text`.
+ """
+ texts = [text]
+ _encoder_config_path = Path(encoder_config_path)
+ embedding_config = load_encoder_config(_encoder_config_path)
+ encoder = SentenceTransformerEncoder(embedding_config)
+ embedding = encoder.encode(texts)
+ return embedding[0]
diff --git a/environment.yml b/environment.yml
index 8e1873c..12981f2 100644
--- a/environment.yml
+++ b/environment.yml
@@ -5,6 +5,8 @@ channels:
dependencies:
# Python version
- python=3.11
+ - faiss-gpu
+ - numpy
# Hugging Face libraries and pytorch
- pip
- pip:
@@ -20,3 +22,4 @@ dependencies:
- types-PyYAML==6.0.12.20250809
- pytest==8.3.5
- pytest-cov==6.2.0
+ - sentence-transformers
diff --git a/notebooks/explore_encoder_model.ipynb b/notebooks/explore_encoder_model.ipynb
new file mode 100644
index 0000000..2a2221c
--- /dev/null
+++ b/notebooks/explore_encoder_model.ipynb
@@ -0,0 +1,127 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "0",
+ "metadata": {},
+ "source": [
+ "Based on [this](https://www.sbert.net/examples/sentence_transformer/applications/semantic-search/README.html#background) code from the official Sentence Transformers website"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1",
+ "metadata": {},
+ "source": [
+ "This does semantic search (useful for retrieval) which is an application of generating embeddings"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import torch\n",
+ "\n",
+ "from sentence_transformers import SentenceTransformer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "embedder = SentenceTransformer(\"all-MiniLM-L6-v2\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Corpus with example documents\n",
+ "corpus = [\n",
+ " \"Machine learning is a field of study that gives computers the ability to learn without being explicitly programmed.\",\n",
+ " \"Deep learning is part of a broader family of machine learning methods based on artificial neural networks with representation learning.\",\n",
+ " \"Neural networks are computing systems vaguely inspired by the biological neural networks that constitute animal brains.\",\n",
+ " \"Mars rovers are robotic vehicles designed to travel on the surface of Mars to collect data and perform experiments.\",\n",
+ " \"The James Webb Space Telescope is the largest optical telescope in space, designed to conduct infrared astronomy.\",\n",
+ " \"SpaceX's Starship is designed to be a fully reusable transportation system capable of carrying humans to Mars and beyond.\",\n",
+ " \"Global warming is the long-term heating of Earth's climate system observed since the pre-industrial period due to human activities.\",\n",
+ " \"Renewable energy sources include solar, wind, hydro, and geothermal power that naturally replenish over time.\",\n",
+ " \"Carbon capture technologies aim to collect CO2 emissions before they enter the atmosphere and store them underground.\",\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Use \"convert_to_tensor=True\" to keep the tensors on GPU (if available)\n",
+ "corpus_embeddings = embedder.encode_document(corpus, convert_to_tensor=True)\n",
+ "print(f\"Corpus embeddings shape: {corpus_embeddings.shape}\")\n",
+ "\n",
+ "# Query sentences:\n",
+ "queries = [\n",
+ " \"How do artificial neural networks work?\",\n",
+ " \"What technology is used for modern space exploration?\",\n",
+ " \"How can we address climate change challenges?\",\n",
+ "]\n",
+ "\n",
+ "# Find the closest 5 sentences of the corpus for each query sentence based on cosine similarity\n",
+ "top_k = min(5, len(corpus))\n",
+ "for query in queries:\n",
+ " query_embedding = embedder.encode_query(query, convert_to_tensor=True)\n",
+ " print(f\" Query embedding shape: {query_embedding.shape}\")\n",
+ "\n",
+ " # We use cosine-similarity and torch.topk to find the highest 5 scores\n",
+ " similarity_scores = embedder.similarity(query_embedding, corpus_embeddings)[0]\n",
+ " scores, indices = torch.topk(similarity_scores, k=top_k)\n",
+ "\n",
+ " print(\"\\nQuery:\", query)\n",
+ " print(\"Top 5 most similar sentences in corpus:\")\n",
+ "\n",
+ " for score, idx in zip(scores, indices):\n",
+ " print(f\"(Score: {score:.4f})\", corpus[idx])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6",
+ "metadata": {},
+ "source": [
+ "So this encoder model encodes the text into a 384-dimensional embedding space"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "atlas",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.14"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/tests/unittests/scripts/conftest.py b/tests/unittests/scripts/conftest.py
new file mode 100644
index 0000000..d8841da
--- /dev/null
+++ b/tests/unittests/scripts/conftest.py
@@ -0,0 +1,165 @@
+from pathlib import Path
+import yaml
+import pytest
+import json
+
+from atlas.core.embedder.config import load_encoder_config, EncoderConfig
+
+
+@pytest.fixture
+def dummy_processed_data_path(tmp_path: Path) -> Path:
+ """
+ Create a dummy processed data file for testing.
+
+ Args:
+ tmp_path (Path): Temporary directory provided by pytest.
+
+ Returns:
+ Path: The path to the created dummy processed data file.
+ """
+ dummy_data = [
+ {
+ "note_id": "_learning about me/what I learnt about myself when dealing with ADHD.md",
+ "title": "what I learnt about myself when dealing with ADHD",
+ "relative_path": "_learning about me/what I learnt about myself when dealing with ADHD.md",
+ "raw_text": "\n- aim to do less, end up doing more\n- breakdown tasks into initial granular nested problems\n- work in iterations. Get 80% value from 20% of things, Pareto's principle\n- do long things, delay instant gratification. Avoid seeking novelty. Finish reading one book, watch one big video, play one game.\n- morning and night routines\n- just open things up, set things up, write in notepad ++ then take a break. Dont have to start immediately\n- do a little everyday in a robust consistent manner. Dont have to do it all in one day, in this one session. Takes time. Be patient but consistent\n- systems thinker - seeing the whole picture and its parts as early as possible - related to working in iterations",
+ "frontmatter": {},
+ "headings": [],
+ "tags": [],
+ "wikilinks": [],
+ "word_count": 127,
+ },
+ {
+ "note_id": "_Meditations/Games and Life.md",
+ "title": "Games and Life",
+ "relative_path": "_Meditations/Games and Life.md",
+ "raw_text": "\n### Roguelikes and Life\n- **Learning** more about the thing that's causing us _problems/scaring us/making us uncomfortable_ will help us deal with the problem better\n- For example, in Darkest Dungeon, stress is a huge problem for me but as I learn more about it, the fear changes into enthusiasm because I understand things better\n\t- So we try to apply the same in life as well\n---\n- Frame _questions_ regarding the problems Im currently facing\n\t- and _answer_ them\n\t\t- if you dont know the answer, search online or ask someone\n---\n- Learn from mistakes in life\n\t- For ex, we didnt know about traps in DD (Darkest Dungeon) and took deadly stress damage\n\t\t- but thats fine as long as we used that opportunity to learn about traps and how to avoid in future\n- Use experiences/problems/obstacles to acquire information\n\t- For ex, every run is an experience and helps us learn more\n- Write these things down\n---\n### Plateup and life\n- If something makes you uncomfortable (because you don't understand it or are not good at it)\n\t- and if that thing is good for you, its result benefits you\n\t\t- then embrace it and do more of that thing\n- This is the only way to get used to that uncomfortable feeling and learn more about it\n\t- and eventually it becomes less uncomfortable\n- For ex, in Plateup, we see a recipe we didn't understand before and we chose it so that we could embrace it and understand it\n\t- the game ended because I didn't know it but I was able to learn from it and next time I did better\n---\n- the real reason why the tasks of life evoke fear and anxiety is because I give utmost importance to just the outcome/result of the task\n\t- I need to focus on the doing part, the journey. 
To take control of my fate, to be in control\n\t- focusing on what I CAN DO, leads to excitement\n\t\t- it becomes opportunities to solve problems/ challenges/ obstacles\n\t\t\t- solving them leads to contentment and fulfillment and happiness\n---\n\n### Skyrim/Valorant and Life obstacles\n- When I was challenged to a fist fight by Uthgerd The Unbroken, it was not an easy fight considering her armour is much better than mine, the combat in the game is bad and its a fist fight (no weapon, magic or healing)\n- I lost multiple times and yet I didnt give up, I kept going back to it and wanted to beat her/ overcome the obstacle\n- similarly in life, similar problems will keep arising and the point is to not give up after the first failure but to learn from it and keep trying to overcome the obstacle\n- for ex, when applying for jobs, we fail an interview, instead of giving up we learn from it and keep trying to pass the interviews\n- the same thing happens in valorant too, where I keep trying to get better at aiming and getting headshots even though I have bad ping and am at a disadvantage\n\t- I drop off sometimes but always come back in order to try to best the obstacle and eventually I keep improving\n\n---\n\n### Spelunky\n- the game is a corollary for life. Sometimes things dont go our way like the shopkeeper is enraged due to something we didn't do, the key and chest and in locations unreachable without prior knowledge, its a dark level and rushing water and black market entrance level so we miss the black market. But thats fine. I dont give up the run immediately but rather take it as a challenge to see how far can I go with this hand I've been dealt.\n- The main reason I give up is because I think \"since I dont have the conditions I think I needed to succeed, there's no point pursuing things further. Whats the point of trying if I know I cant succeed\". Many problems with this way of thinking. \n\t- First, I make up the conditions to succeed. 
I dont know for sure if those are actually the conditions to succeed\n\t- Second, I dont know for sure that if those conditions to succeed arent available then I wont be able succeed for sure. It might be harder to succeed maybe but not impossible. Its impossible if I give up.\n\t\t- subpoint - even if I have all the conditions to succeed available, I might not be able to succeed. Eg, I have a jetpack and a shotgun and good amount of health, I die and the run ends\n\t- Third, if I try to succeed ie, work with what I have, there's always a chance to succeed. The chance may be low but its not 0. And even 1% is more than 0%. (this is related to the previous point)\n\t\t- eg, I lost 1 hp due to some pointless mistake. I immediately thought of abandoning the run. I remember all this that I've written here and kept progressing and found a jetpack and compass in the market in the next level. \n\t- Fourth, the point isnt always about succeeding but even failing is fine because I can learn from it and try again and try again and again and again. As long as I'm learning its good. And failure isnt the end of it all. Life goes on and opportunities come again. Learning from failure and improving makes me capable of being able to capitalize on those opportunities better.\n- I do think this mental distortion might have something to do with my ADHD. If things dont go properly/successfully (properly/successfully as defined by me or society) right from the very beginning ie, each small step needs to be perfect and successful, then I give up. I think its my ADHD kicking in which wants rewards in the short term ie, each small step going properly rather than considering things in the long term.\n- `important` - I beat spelunky finally. And prior to beating it today, I had a run where I did everything perfectly but some random explosion killed me. Another one - The shopkeeper got angered due to a snail spawning in the shop in the black market and I made no mistake in that run. 
Another one - There was a bug and I got stuck exiting and got killed by a tiki trap. But I didnt give up. I immediately did a quick restart. So why can I keep restarting and not give up in games but not do so in real life? All the times I got screwed in Spelunky where it wasnt my mistake has happened and will happen in life as well. I could do everything perfectly, be very good, follow all the rules and life will fuck me over. Life is procedurally generated just like Spelunky. But I do a quick restart and immediately try again because if I do so then eventually I will win the game just like how I eventually beat spelunky. ^f7912e\n- Life is hard just like Spelunky. Then does that mean I cant beat it? Nope, I learn and keep trying and keep getting better and eventually everything will align properly for me. Luck will be on my side and I will win at life.\n---\n\n",
+ "frontmatter": {},
+ "headings": [
+ {"level": 3, "title": "Roguelikes and Life"},
+ {"level": 3, "title": "Plateup and life"},
+ {"level": 3, "title": "Skyrim/Valorant and Life obstacles"},
+ {"level": 3, "title": "Spelunky"},
+ ],
+ "tags": [],
+ "wikilinks": [],
+ "word_count": 1233,
+ },
+ ]
+ processed_data_path = tmp_path / "obsidian_index.json"
+ with processed_data_path.open("w", encoding="utf-8") as f:
+ json.dump(dummy_data, f, indent=2, ensure_ascii=False)
+ return processed_data_path
+
+
+@pytest.fixture
+def dummy_chunk_data_path(tmp_path: Path) -> Path:
+ """
+ Create a dummy chunk data file for testing.
+
+ Args:
+ tmp_path (Path): Temporary directory provided by pytest.
+
+ Returns:
+ Path: The path to the created dummy chunk data file.
+ """
+ dummy_chunk_data = [
+ {
+ "chunk_id": "test Note.md::test_heading::chunk_0",
+ "note_id": "test Note.md",
+ "title": "Test Note",
+ "relative_path": "test_note.md",
+ "heading": "Test Heading",
+ "chunk_index": 0,
+ "text": "This is a test chunk.",
+ "word_count": 5,
+ "tags": [],
+ "frontmatter": {},
+ }
+ ]
+ chunk_data_path = tmp_path / "chunked_data.json"
+ with chunk_data_path.open("w", encoding="utf-8") as f:
+ json.dump(dummy_chunk_data, f, indent=2, ensure_ascii=False)
+ return chunk_data_path
+
+
+@pytest.fixture
+def dummy_embedded_chunk_data_path(tmp_path: Path) -> Path:
+ """
+ Create a dummy embedded chunk data file for testing.
+
+ Args:
+ tmp_path (Path): Temporary directory provided by pytest.
+
+ Returns:
+ Path: The path to the created dummy embedded chunk data file.
+ """
+ dummy_chunk_data = [
+ {
+ "chunk_id": "test Note.md::test_heading::chunk_0",
+ "note_id": "test Note.md",
+ "title": "Test Note",
+ "relative_path": "test_note.md",
+ "heading": "Test Heading",
+ "chunk_index": 0,
+ "text": "This is a test chunk.",
+ "word_count": 5,
+ "tags": [],
+ "frontmatter": {},
+ "embedding": [1.2, 2.4, 4.9],
+ }
+ ]
+ embedded_chunk_data_path = tmp_path / "embedded_chunks.json"
+ with embedded_chunk_data_path.open("w", encoding="utf-8") as f:
+ json.dump(dummy_chunk_data, f, indent=2, ensure_ascii=False)
+ return embedded_chunk_data_path
+
+
+@pytest.fixture
+def dummy_encoder_config(tmp_path: Path) -> EncoderConfig:
+ """
+ Create dummy encoder configuration data for testing.
+
+ Args:
+ tmp_path (Path): Temporary directory provided by pytest.
+
+ Returns:
+ EncoderConfig: Instance of the encoder configuration dataclass.
+ """
+ config_data = {
+ "model_name": "all-MiniLM-L6-v2",
+ "batch_size": 32,
+ "normalize_embeddings": True,
+ "device": "cuda",
+ }
+ config_path = tmp_path / "encoder_config.yaml"
+ with config_path.open("w", encoding="utf-8") as f:
+ yaml.dump(config_data, f)
+
+ # Load the configuration
+ loaded_config = load_encoder_config(config_path)
+ return loaded_config
+
+
+@pytest.fixture
+def dummy_encoder_config_path(tmp_path: Path) -> Path:
+ """
+ Create a dummy encoder configuration file for testing.
+
+ Args:
+ tmp_path (Path): Temporary directory provided by pytest.
+
+ Returns:
+ Path: The path to the created dummy encoder configuration file.
+ """
+ config_data = {
+ "model_name": "all-MiniLM-L6-v2",
+ "batch_size": 32,
+ "normalize_embeddings": True,
+ "device": "cuda",
+ }
+ config_path = tmp_path / "encoder_config.yaml"
+ with config_path.open("w", encoding="utf-8") as f:
+ yaml.dump(config_data, f)
+
+ return config_path
diff --git a/tests/unittests/scripts/test_config.py b/tests/unittests/scripts/test_config.py
new file mode 100644
index 0000000..0a974e2
--- /dev/null
+++ b/tests/unittests/scripts/test_config.py
@@ -0,0 +1,76 @@
+import pytest
+import yaml
+from pathlib import Path
+
+from atlas.core.embedder.config import load_encoder_config, EncoderConfig
+
+
+@pytest.mark.unittest
+@pytest.mark.runonci
+def test_load_encoder_config_positive(tmp_path: Path):
+ """
+ Test loading the encoder configuration file when file is available and not empty.
+
+ Args:
+ tmp_path (Path): Temporary directory provided by pytest.
+ """
+
+ # Create a temporary YAML config file
+ config_data = {
+ "model_name": "all-MiniLM-L6-v2",
+ "batch_size": 32,
+ "normalize_embeddings": True,
+ "device": "cuda",
+ }
+ config_path = tmp_path / "encoder_config.yaml"
+ with config_path.open("w", encoding="utf-8") as f:
+ yaml.dump(config_data, f)
+
+ # Load the configuration
+ loaded_config = load_encoder_config(config_path)
+
+ # Assert that the loaded configuration matches the original data
+ assert isinstance(loaded_config, EncoderConfig)
+ assert loaded_config.model_name == config_data["model_name"]
+ assert loaded_config.batch_size == config_data["batch_size"]
+ assert loaded_config.normalize_embeddings == config_data["normalize_embeddings"]
+ assert loaded_config.device == config_data["device"]
+
+
+@pytest.mark.unittest
+@pytest.mark.runonci
+def test_load_encoder_config_negative_empty_file(tmp_path):
+ """
+ Test loading the encoder configuration file when file is empty.
+
+ Args:
+ tmp_path (Path): Temporary directory provided by pytest.
+ """
+ # Create an empty temporary YAML config file
+ config_path = tmp_path / "empty_encoder_config.yaml"
+ with config_path.open("w", encoding="utf-8") as f:
+ f.write("")
+
+ # Attempt to load the configuration and expect a ValueError
+ with pytest.raises(ValueError) as exc_info:
+ load_encoder_config(config_path)
+
+ assert "Encoder configuration file is empty" in str(exc_info.value)
+
+
+@pytest.mark.unittest
+@pytest.mark.runonci
+def test_load_encoder_config_negative_file_not_found(tmp_path):
+ """
+ Test loading the encoder configuration file when file is not found.
+
+ Args:
+ tmp_path (Path): Temporary directory provided by pytest.
+ """
+ config_path = tmp_path / "incorrect_config_file.yaml"
+
+ # Attempt to load the configuration and expect a FileNotFoundError
+ with pytest.raises(FileNotFoundError) as exc_info:
+ load_encoder_config(config_path)
+
+ assert "Encoder configuration file not found" in str(exc_info.value)
diff --git a/tests/unittests/scripts/test_embedder_utils.py b/tests/unittests/scripts/test_embedder_utils.py
new file mode 100644
index 0000000..dde17c4
--- /dev/null
+++ b/tests/unittests/scripts/test_embedder_utils.py
@@ -0,0 +1,52 @@
+import pytest
+from pathlib import Path
+
+from atlas.utils.embedder_utils import load_embedded_chunks, generate_embedding
+
+
+@pytest.mark.unittest
+@pytest.mark.runonci
+def test_load_embedded_chunks_positive(dummy_embedded_chunk_data_path: Path) -> None:
+ """
+ Test if embedded chunks data can be successfully loaded from json file.
+
+ Args:
+ dummy_embedded_chunk_data_path (Path): The path to the dummy embedded chunks json file.
+ """
+
+ metadata = load_embedded_chunks(str(dummy_embedded_chunk_data_path))
+ assert len(metadata) == 1
+ assert metadata[0]["chunk_id"] == "test Note.md::test_heading::chunk_0"
+
+
+@pytest.mark.unittest
+@pytest.mark.runonci
+def test_load_embedded_chunks_negative(tmp_path: Path) -> None:
+ """
+ Test if exception is raised if the embedded chunks json file isn't available.
+
+ Args:
+ tmp_path (Path): Temporary path provided by pytest.
+ """
+
+ dummy_embedded_chunk_data_path = (
+ tmp_path / "embedded_chunks.json"
+ ) # file on this path doesnt exist
+ with pytest.raises(Exception) as exc_info:
+ _ = load_embedded_chunks(str(dummy_embedded_chunk_data_path))
+ assert "Error loading embedded chunks json file" in str(exc_info.value)
+
+
+@pytest.mark.unittest
+@pytest.mark.runonci
+def test_generate_embedding(dummy_encoder_config_path: Path) -> None:
+ """
+ Test encoder model generates proper embedding for a given text.
+
+ Args:
+ dummy_encoder_config_path (Path): The path to the dummy encoder configuration file.
+ """
+
+ text = "Hi, my name is Bob Ross!"
+ embedding = generate_embedding(text, str(dummy_encoder_config_path))
+ assert embedding.shape == (384,)
diff --git a/tests/unittests/scripts/test_faiss_vector_store.py b/tests/unittests/scripts/test_faiss_vector_store.py
new file mode 100644
index 0000000..fdffc0b
--- /dev/null
+++ b/tests/unittests/scripts/test_faiss_vector_store.py
@@ -0,0 +1,193 @@
+import json
+import pytest
+import numpy as np
+from pathlib import Path
+import faiss
+
+from atlas.core.indexer.faiss_vector_store import FaissVectorStore
+from atlas.utils.embedder_utils import load_embedded_chunks
+
+
+@pytest.mark.unittest
+@pytest.mark.runonci
+def test_add_positive(dummy_embedded_chunk_data_path: Path) -> None:
+ """
+ Test that embeddings/vectors can be added to the FAISS vector store.
+
+ Args:
+ dummy_embedded_chunk_data_path (Path): The path to the dummy embedded chunks json file.
+ """
+ vectors = np.array([[1, 2, 3]])  # one 3-element row, matching dim=3 below
+ embedded_chunks = load_embedded_chunks(str(dummy_embedded_chunk_data_path))
+ store = FaissVectorStore(dim=3)
+ store.add(vectors, embedded_chunks)
+ assert store.index.ntotal == len(vectors)
+ assert len(store.metadata) == len(embedded_chunks)
+
+
+@pytest.mark.unittest
+@pytest.mark.runonci
+def test_add_negative_invalid_embedding_shape(
+ dummy_embedded_chunk_data_path: Path,
+) -> None:
+ """
+ Test that a ValueError is raised if the embedding(s) to be added to the FAISS
+ vector store have an incorrect shape.
+
+ Args:
+ dummy_embedded_chunk_data_path (Path): The path to the dummy embedded chunks json file.
+ """
+ vectors = np.array([1, 2])  # 1-D array of two elements, while the store is created with dim=3
+ embedded_chunks = load_embedded_chunks(str(dummy_embedded_chunk_data_path))
+ store = FaissVectorStore(dim=3)
+ with pytest.raises(ValueError) as exc_info:
+ store.add(vectors, embedded_chunks)
+ assert "Invalid vector shape. Expected num of dim = 2 and size of vector" in str(
+ exc_info.value
+ )
+
+
+@pytest.mark.unittest
+@pytest.mark.runonci
+def test_add_negative_length_mismatch(dummy_embedded_chunk_data_path: Path) -> None:
+ """
+ Test that a ValueError is raised if the number of embeddings/vectors and the
+ number of associated metadata entries do not match when adding embedding(s)
+ to the FAISS vector store.
+
+ Args:
+ dummy_embedded_chunk_data_path (Path): The path to the dummy embedded chunks json file.
+ """
+ vectors = np.array([[1, 2, 3], [2, 3, 4]])  # two vectors; the fixture presumably supplies fewer metadata entries — TODO confirm
+ embedded_chunks = load_embedded_chunks(str(dummy_embedded_chunk_data_path))
+ store = FaissVectorStore(dim=3)
+ with pytest.raises(ValueError) as exc_info:
+ store.add(vectors, embedded_chunks)
+ assert "Vectors and metadata length mismatch" in str(exc_info.value)
+
+
+@pytest.mark.unittest
+@pytest.mark.runonci
+def test_search_positive(dummy_embedded_chunk_data_path: Path) -> None:
+ """
+ Test that the FAISS store can successfully search for neighbor embeddings of a
+ provided input embedding.
+
+ Args:
+ dummy_embedded_chunk_data_path (Path): The path to the dummy embedded chunks json file.
+ """
+ vectors = np.array([[1, 2, 3]])
+ embedded_chunks = load_embedded_chunks(str(dummy_embedded_chunk_data_path))
+ store = FaissVectorStore(dim=3)
+ store.add(vectors, embedded_chunks)
+ query_vector = np.array([1, 2, 2])  # 1-D query with 3 elements, same size as dim=3
+ k = 1
+ results = store.search(query_vector, k)
+ assert len(results) == k
+ assert "score" in results[0].keys()  # each result is expected to carry a "score" entry
+
+
+@pytest.mark.unittest
+@pytest.mark.runonci
+def test_search_negative_large_k(dummy_embedded_chunk_data_path: Path) -> None:
+ """
+ Test that an exception is raised if the number of neighbors, i.e. `k`, to be searched
+ in the vector store for an input embedding is more than the number of embeddings
+ saved in the vector store.
+
+ Args:
+ dummy_embedded_chunk_data_path (Path): The path to the dummy embedded chunks json file.
+ """
+ vectors = np.array([[1, 2, 3]])
+ embedded_chunks = load_embedded_chunks(str(dummy_embedded_chunk_data_path))
+ store = FaissVectorStore(dim=3)
+ store.add(vectors, embedded_chunks)
+ query_vector = np.array([[1, 2, 2]])
+ k = 3  # larger than the single vector added to the store above
+ with pytest.raises(Exception) as exc_info:
+ _ = store.search(query_vector, k)
+ assert "k is more than maximum possible value" in str(exc_info.value)
+
+
+@pytest.mark.unittest
+@pytest.mark.runonci
+def test_save(tmp_path: Path, dummy_embedded_chunk_data_path: Path) -> None:
+ """
+ Test that the generated index and metadata files can be saved and read back.
+
+ Args:
+ tmp_path (Path): Temporary path provided by pytest.
+ dummy_embedded_chunk_data_path (Path): The path to the dummy embedded chunks json file.
+ """
+ vectors = np.array([[1, 2, 3]])
+ embedded_chunks = load_embedded_chunks(str(dummy_embedded_chunk_data_path))
+ store = FaissVectorStore(dim=3)
+ store.add(vectors, embedded_chunks)
+ results_save_path = tmp_path / "Results"  # not created by this test; presumably save() creates it — TODO confirm
+ store.save(str(results_save_path))
+ index_file_path = results_save_path / "index.faiss"
+ metadata_file_path = results_save_path / "metadata.json"
+
+ assert index_file_path.exists()
+ index_data = faiss.read_index(str(index_file_path))  # read the index back with FAISS directly
+ assert index_data.ntotal == len(vectors)
+
+ assert metadata_file_path.exists()
+ with metadata_file_path.open("r", encoding="utf-8") as f:
+ metadata_data = json.load(f)
+ assert len(metadata_data) == len(embedded_chunks)
+
+
+@pytest.mark.unittest
+@pytest.mark.runonci
+def test_load_positive(tmp_path: Path, dummy_embedded_chunk_data_path: Path) -> None:
+ """
+ Test that saved index and metadata files can be loaded without raising.
+
+ Args:
+ tmp_path (Path): Temporary path provided by pytest.
+ dummy_embedded_chunk_data_path (Path): The path to the dummy embedded chunks json file.
+ """
+ vectors = np.array([[1, 2, 3]])
+ embedded_chunks = load_embedded_chunks(str(dummy_embedded_chunk_data_path))
+ store = FaissVectorStore(dim=3)
+ store.add(vectors, embedded_chunks)
+ results_save_path = tmp_path / "Results"
+ store.save(str(results_save_path))
+ index_file_path = results_save_path / "index.faiss"
+ metadata_file_path = results_save_path / "metadata.json"
+ store.load(results_load_path=str(results_save_path))  # NOTE(review): the assertions below re-read the files from disk; they never inspect the store after load()
+
+ assert index_file_path.exists()
+ index_data = faiss.read_index(str(index_file_path))
+ assert index_data.ntotal == len(vectors)
+
+ assert metadata_file_path.exists()
+ with metadata_file_path.open("r", encoding="utf-8") as f:
+ metadata_data = json.load(f)
+ assert len(metadata_data) == len(embedded_chunks)
+
+
+@pytest.mark.unittest
+@pytest.mark.runonci
+def test_load_negative_file_not_found(
+ tmp_path: Path, dummy_embedded_chunk_data_path: Path
+) -> None:
+ """
+ Test that an exception is raised if the index or metadata file is not found when trying to load them.
+
+ Args:
+ tmp_path (Path): Temporary path provided by pytest.
+ dummy_embedded_chunk_data_path (Path): The path to the dummy embedded chunks json file.
+ """
+ vectors = np.array([[1, 2, 3]])
+ embedded_chunks = load_embedded_chunks(str(dummy_embedded_chunk_data_path))
+ store = FaissVectorStore(dim=3)
+ store.add(vectors, embedded_chunks)
+ results_save_path = tmp_path / "Results"
+ store.save(str(results_save_path))
+ results_load_path = tmp_path / "Res" # folder doesn't exist
+
+ with pytest.raises(Exception) as exc_info:
+ store.load(results_load_path=str(results_load_path))
+ assert "Error reading either index file or metadata file" in str(exc_info.value)
diff --git a/tests/unittests/scripts/test_obsidian_vault_processor.py b/tests/unittests/scripts/test_obsidian_vault_processor.py
index f9ef912..69c200f 100644
--- a/tests/unittests/scripts/test_obsidian_vault_processor.py
+++ b/tests/unittests/scripts/test_obsidian_vault_processor.py
@@ -3,7 +3,7 @@
import json
from pathlib import Path
-from atlas.core.ingest.obsidian_vault_processor import ObsidianVaultProcessor
+from atlas.core.ingester.obsidian_vault_processor import ObsidianVaultProcessor
@pytest.fixture
diff --git a/tests/unittests/scripts/test_sentence_transformer_embedder.py b/tests/unittests/scripts/test_sentence_transformer_embedder.py
new file mode 100644
index 0000000..4b7b5c4
--- /dev/null
+++ b/tests/unittests/scripts/test_sentence_transformer_embedder.py
@@ -0,0 +1,213 @@
+import pytest
+import json
+from pathlib import Path
+from typing import List, Dict
+
+from atlas.core.embedder.sentence_transformer.impl_embedder import (
+ SentenceTransformerEmbedder,
+)
+from atlas.core.embedder.sentence_transformer.impl_encoder import (
+ SentenceTransformerEncoder,
+)
+
+
+@pytest.fixture
+def dummy_chunks() -> List[Dict]:
+ """
+ Create two dummy chunks for testing.
+
+ Returns:
+ List[Dict]: The dummy chunk data.
+ """
+ dummy_chunk_data = [
+ {
+ "chunk_id": "test Note.md::test_heading::chunk_0",
+ "note_id": "test Note.md",
+ "title": "Test Note",
+ "relative_path": "test_note.md",
+ "heading": "Test Heading",
+ "chunk_index": 0,
+ "text": "This is a test chunk.",
+ "word_count": 5,
+ "tags": [],
+ "frontmatter": {},
+ },
+ {
+ "chunk_id": "test Note 2.md::test_heading::chunk_0",
+ "note_id": "test Note 2.md",
+ "title": "Test Note 2",
+ "relative_path": "test_note 2.md",
+ "heading": "Test Heading 2",
+ "chunk_index": 0,
+ "text": "This is a test chunk 2.",
+ "word_count": 6,
+ "tags": [],
+ "frontmatter": {},
+ },
+ ]
+ return dummy_chunk_data
+
+
+@pytest.mark.unittest
+@pytest.mark.runonci
+def test_load_encoder(dummy_encoder_config_path: Path):
+ """
+ Test that SentenceTransformerEmbedder loads its encoder on construction.
+
+ Args:
+ dummy_encoder_config_path (Path): The path to the dummy encoder configuration file.
+ """
+ # The encoder is loaded when `SentenceTransformerEmbedder` is initialized, so no separate load call is needed
+ embedder = SentenceTransformerEmbedder(
+ "dummy_chunked_data.json", "dummy.json", str(dummy_encoder_config_path)
+ )
+ assert embedder.encoder is not None
+ assert isinstance(embedder.encoder, SentenceTransformerEncoder)
+
+
+@pytest.mark.unittest
+@pytest.mark.runonci
+def test_embed_chunks_positive(
+ dummy_encoder_config_path: Path, dummy_chunks: List[Dict]
+):
+ """
+ Test that SentenceTransformerEmbedder generates embeddings for the provided chunks.
+
+ Args:
+ dummy_encoder_config_path (Path): The path to the dummy encoder configuration file.
+ dummy_chunks (List[Dict]): The provided dummy chunks.
+ """
+ embedder = SentenceTransformerEmbedder(
+ "dummy_chunked_data.json", "dummy.json", str(dummy_encoder_config_path)
+ )
+ embedded_chunks = embedder.embed_chunks(dummy_chunks)
+ assert len(embedded_chunks) == len(dummy_chunks)  # one embedded chunk per input chunk
+ assert "embedding" in embedded_chunks[0].keys()
+ assert len(embedded_chunks[0]["embedding"]) == 384
+
+
+@pytest.mark.unittest
+@pytest.mark.runonci
+def test_embed_chunks_negative(dummy_encoder_config_path: Path):
+ """
+ Test that `SentenceTransformerEmbedder.embed_chunks()` returns an empty result
+ when it is passed no chunk data.
+
+ Args:
+ dummy_encoder_config_path (Path): The path to the dummy encoder configuration file.
+ """
+ embedder = SentenceTransformerEmbedder(
+ "dummy_chunked_data.json", "dummy.json", str(dummy_encoder_config_path)
+ )
+ embedded_chunks = embedder.embed_chunks([])
+ assert len(embedded_chunks) == 0
+
+
+@pytest.mark.unittest
+@pytest.mark.runonci
+def test_read_chunk_data_positive(
+ dummy_encoder_config_path: Path, dummy_chunk_data_path: Path
+):
+ """
+ Test that SentenceTransformerEmbedder can read chunk data from the provided chunk data
+ path.
+
+ Args:
+ dummy_encoder_config_path (Path): The path to the dummy encoder configuration file.
+ dummy_chunk_data_path (Path): The path to the dummy chunk data file.
+ """
+ embedder = SentenceTransformerEmbedder(
+ str(dummy_chunk_data_path), "dummy.json", str(dummy_encoder_config_path)
+ )
+ chunks = embedder.read_chunk_data()
+ assert chunks is not None
+ assert len(chunks) == 1  # the fixture file is expected to hold exactly one chunk
+ assert isinstance(chunks[0], dict)
+ assert chunks[0]["chunk_id"] == "test Note.md::test_heading::chunk_0"
+
+
+@pytest.mark.unittest
+@pytest.mark.runonci
+def test_read_chunk_data_negative(
+ dummy_encoder_config_path: Path, dummy_chunk_data_path: Path
+):
+ """
+ Test that SentenceTransformerEmbedder returns None instead of chunk data
+ when the provided chunk data path doesn't exist.
+
+ Args:
+ dummy_encoder_config_path (Path): The path to the dummy encoder configuration file.
+ dummy_chunk_data_path (Path): The path to the dummy chunk data file.
+ """
+ dummy_chunk_data_path = (
+ dummy_chunk_data_path.parent / "dummy_chunked_data.json"
+ ) # doesn't exist
+ embedder = SentenceTransformerEmbedder(
+ str(dummy_chunk_data_path), "dummy.json", str(dummy_encoder_config_path)
+ )
+ chunks = embedder.read_chunk_data()
+ assert chunks is None
+
+
+@pytest.mark.unittest
+@pytest.mark.runonci
+def test_save_embedded_chunks(tmp_path: Path, dummy_encoder_config_path: Path):
+ """
+ Test that SentenceTransformerEmbedder saves embedded chunk data that reads back from JSON unchanged.
+
+ Args:
+ tmp_path (Path): Temporary directory provided by pytest.
+ dummy_encoder_config_path (Path): The path to the dummy encoder configuration file.
+ """
+ output_file_path = tmp_path / "embedded_chunks.json"
+ dummy_embedded_chunks = [
+ {
+ "chunk_id": "test Note.md::test_heading::chunk_0",
+ "note_id": "test Note.md",
+ "title": "Test Note",
+ "relative_path": "test_note.md",
+ "heading": "Test Heading",
+ "chunk_index": 0,
+ "text": "This is a test chunk.",
+ "word_count": 5,
+ "tags": [],
+ "frontmatter": {},
+ "embedding": [float(i) for i in range(384)],
+ }
+ ]
+ embedder = SentenceTransformerEmbedder(
+ "dummy_chunked_data.json", str(output_file_path), str(dummy_encoder_config_path)
+ )
+ embedder.save_embedded_chunks(dummy_embedded_chunks)
+ assert output_file_path.exists()
+ with output_file_path.open("r", encoding="utf-8") as f:
+ saved_data = json.load(f)
+ assert saved_data == dummy_embedded_chunks
+
+
+@pytest.mark.unittest
+@pytest.mark.runonci
+def test_embed(
+ tmp_path: Path, dummy_chunk_data_path: Path, dummy_encoder_config_path: Path
+):
+ """
+ Test SentenceTransformerEmbedder's end-to-end `embed()` run: it should write the embedded
+ chunk data generated from the chunks at the provided chunk data path.
+
+ Args:
+ tmp_path (Path): Temporary directory provided by pytest.
+ dummy_chunk_data_path (Path): The path to the dummy chunk data file.
+ dummy_encoder_config_path (Path): The path to the dummy encoder configuration file.
+ """
+ output_file_path = tmp_path / "embedded_chunks.json"
+ embedder = SentenceTransformerEmbedder(
+ str(dummy_chunk_data_path),
+ str(output_file_path),
+ str(dummy_encoder_config_path),
+ )
+ embedder.embed()
+ assert output_file_path.exists()
+ with output_file_path.open("r", encoding="utf-8") as f:
+ saved_data = json.load(f)
+
+ assert len(saved_data[0]["embedding"]) == 384  # only the first saved chunk's embedding length is checked
diff --git a/tests/unittests/scripts/test_sentence_transformer_encoder.py b/tests/unittests/scripts/test_sentence_transformer_encoder.py
new file mode 100644
index 0000000..0a4818a
--- /dev/null
+++ b/tests/unittests/scripts/test_sentence_transformer_encoder.py
@@ -0,0 +1,37 @@
+import pytest
+
+from atlas.core.embedder.config import EncoderConfig
+from atlas.core.embedder.sentence_transformer.impl_encoder import (
+ SentenceTransformerEncoder,
+)
+from sentence_transformers import SentenceTransformer
+
+
+@pytest.mark.unittest
+@pytest.mark.runonci
+def test_load(dummy_encoder_config: EncoderConfig):
+ """
+ Test that the SentenceTransformerEncoder wrapper holds a loaded model after construction.
+
+ Args:
+ dummy_encoder_config (EncoderConfig): Loaded encoder configuration data.
+ """
+ encoder = SentenceTransformerEncoder(dummy_encoder_config)  # no explicit load() call in this test
+ assert encoder.model is not None
+ assert isinstance(encoder.model, SentenceTransformer)
+
+
+@pytest.mark.unittest
+@pytest.mark.runonci
+def test_encode(dummy_encoder_config: EncoderConfig):
+ """
+ Test that the SentenceTransformerEncoder wrapper encodes a list of texts into embeddings.
+
+ Args:
+ dummy_encoder_config (EncoderConfig): Loaded encoder configuration data.
+ """
+ encoder = SentenceTransformerEncoder(dummy_encoder_config)
+ encoder.load()  # NOTE(review): test_load expects the model right after construction, so this call may be redundant — confirm
+ texts = ["lorem ipsum", "do re mi fa so la ti", "hello world"]
+ embeddings = encoder.encode(texts)
+ assert embeddings.shape == (3, 384)  # one 384-element row per input text
diff --git a/tests/unittests/scripts/test_structural_chunker.py b/tests/unittests/scripts/test_structural_chunker.py
index b6f397e..e0708a3 100644
--- a/tests/unittests/scripts/test_structural_chunker.py
+++ b/tests/unittests/scripts/test_structural_chunker.py
@@ -5,52 +5,6 @@
from atlas.core.chunker.structural_chunker import StructuralChunker
-@pytest.fixture
-def dummy_processed_data_path(tmp_path: Path) -> Path:
- """
- Create a dummy processed data file for testing.
-
- Args:
- tmp_path (Path): Temporary directory provided by pytest.
-
- Returns:
- Path: The path to the created dummy processed data file.
- """
- dummy_data = [
- {
- "note_id": "_learning about me/what I learnt about myself when dealing with ADHD.md",
- "title": "what I learnt about myself when dealing with ADHD",
- "relative_path": "_learning about me/what I learnt about myself when dealing with ADHD.md",
- "raw_text": "\n- aim to do less, end up doing more\n- breakdown tasks into initial granular nested problems\n- work in iterations. Get 80% value from 20% of things, Pareto's principle\n- do long things, delay instant gratification. Avoid seeking novelty. Finish reading one book, watch one big video, play one game.\n- morning and night routines\n- just open things up, set things up, write in notepad ++ then take a break. Dont have to start immediately\n- do a little everyday in a robust consistent manner. Dont have to do it all in one day, in this one session. Takes time. Be patient but consistent\n- systems thinker - seeing the whole picture and its parts as early as possible - related to working in iterations",
- "frontmatter": {},
- "headings": [],
- "tags": [],
- "wikilinks": [],
- "word_count": 127,
- },
- {
- "note_id": "_Meditations/Games and Life.md",
- "title": "Games and Life",
- "relative_path": "_Meditations/Games and Life.md",
- "raw_text": "\n### Roguelikes and Life\n- **Learning** more about the thing that's causing us _problems/scaring us/making us uncomfortable_ will help us deal with the problem better\n- For example, in Darkest Dungeon, stress is a huge problem for me but as I learn more about it, the fear changes into enthusiasm because I understand things better\n\t- So we try to apply the same in life as well\n---\n- Frame _questions_ regarding the problems Im currently facing\n\t- and _answer_ them\n\t\t- if you dont know the answer, search online or ask someone\n---\n- Learn from mistakes in life\n\t- For ex, we didnt know about traps in DD (Darkest Dungeon) and took deadly stress damage\n\t\t- but thats fine as long as we used that opportunity to learn about traps and how to avoid in future\n- Use experiences/problems/obstacles to acquire information\n\t- For ex, every run is an experience and helps us learn more\n- Write these things down\n---\n### Plateup and life\n- If something makes you uncomfortable (because you don't understand it or are not good at it)\n\t- and if that thing is good for you, its result benefits you\n\t\t- then embrace it and do more of that thing\n- This is the only way to get used to that uncomfortable feeling and learn more about it\n\t- and eventually it becomes less uncomfortable\n- For ex, in Plateup, we see a recipe we didn't understand before and we chose it so that we could embrace it and understand it\n\t- the game ended because I didn't know it but I was able to learn from it and next time I did better\n---\n- the real reason why the tasks of life evoke fear and anxiety is because I give utmost importance to just the outcome/result of the task\n\t- I need to focus on the doing part, the journey. 
To take control of my fate, to be in control\n\t- focusing on what I CAN DO, leads to excitement\n\t\t- it becomes opportunities to solve problems/ challenges/ obstacles\n\t\t\t- solving them leads to contentment and fulfillment and happiness\n---\n\n### Skyrim/Valorant and Life obstacles\n- When I was challenged to a fist fight by Uthgerd The Unbroken, it was not an easy fight considering her armour is much better than mine, the combat in the game is bad and its a fist fight (no weapon, magic or healing)\n- I lost multiple times and yet I didnt give up, I kept going back to it and wanted to beat her/ overcome the obstacle\n- similarly in life, similar problems will keep arising and the point is to not give up after the first failure but to learn from it and keep trying to overcome the obstacle\n- for ex, when applying for jobs, we fail an interview, instead of giving up we learn from it and keep trying to pass the interviews\n- the same thing happens in valorant too, where I keep trying to get better at aiming and getting headshots even though I have bad ping and am at a disadvantage\n\t- I drop off sometimes but always come back in order to try to best the obstacle and eventually I keep improving\n\n---\n\n### Spelunky\n- the game is a corollary for life. Sometimes things dont go our way like the shopkeeper is enraged due to something we didn't do, the key and chest and in locations unreachable without prior knowledge, its a dark level and rushing water and black market entrance level so we miss the black market. But thats fine. I dont give up the run immediately but rather take it as a challenge to see how far can I go with this hand I've been dealt.\n- The main reason I give up is because I think \"since I dont have the conditions I think I needed to succeed, there's no point pursuing things further. Whats the point of trying if I know I cant succeed\". Many problems with this way of thinking. \n\t- First, I make up the conditions to succeed. 
I dont know for sure if those are actually the conditions to succeed\n\t- Second, I dont know for sure that if those conditions to succeed arent available then I wont be able succeed for sure. It might be harder to succeed maybe but not impossible. Its impossible if I give up.\n\t\t- subpoint - even if I have all the conditions to succeed available, I might not be able to succeed. Eg, I have a jetpack and a shotgun and good amount of health, I die and the run ends\n\t- Third, if I try to succeed ie, work with what I have, there's always a chance to succeed. The chance may be low but its not 0. And even 1% is more than 0%. (this is related to the previous point)\n\t\t- eg, I lost 1 hp due to some pointless mistake. I immediately thought of abandoning the run. I remember all this that I've written here and kept progressing and found a jetpack and compass in the market in the next level. \n\t- Fourth, the point isnt always about succeeding but even failing is fine because I can learn from it and try again and try again and again and again. As long as I'm learning its good. And failure isnt the end of it all. Life goes on and opportunities come again. Learning from failure and improving makes me capable of being able to capitalize on those opportunities better.\n- I do think this mental distortion might have something to do with my ADHD. If things dont go properly/successfully (properly/successfully as defined by me or society) right from the very beginning ie, each small step needs to be perfect and successful, then I give up. I think its my ADHD kicking in which wants rewards in the short term ie, each small step going properly rather than considering things in the long term.\n- `important` - I beat spelunky finally. And prior to beating it today, I had a run where I did everything perfectly but some random explosion killed me. Another one - The shopkeeper got angered due to a snail spawning in the shop in the black market and I made no mistake in that run. 
Another one - There was a bug and I got stuck exiting and got killed by a tiki trap. But I didnt give up. I immediately did a quick restart. So why can I keep restarting and not give up in games but not do so in real life? All the times I got screwed in Spelunky where it wasnt my mistake has happened and will happen in life as well. I could do everything perfectly, be very good, follow all the rules and life will fuck me over. Life is procedurally generated just like Spelunky. But I do a quick restart and immediately try again because if I do so then eventually I will win the game just like how I eventually beat spelunky. ^f7912e\n- Life is hard just like Spelunky. Then does that mean I cant beat it? Nope, I learn and keep trying and keep getting better and eventually everything will align properly for me. Luck will be on my side and I will win at life.\n---\n\n",
- "frontmatter": {},
- "headings": [
- {"level": 3, "title": "Roguelikes and Life"},
- {"level": 3, "title": "Plateup and life"},
- {"level": 3, "title": "Skyrim/Valorant and Life obstacles"},
- {"level": 3, "title": "Spelunky"},
- ],
- "tags": [],
- "wikilinks": [],
- "word_count": 1233,
- },
- ]
- processed_data_path = tmp_path / "obsidian_index.json"
- with processed_data_path.open("w", encoding="utf-8") as f:
- json.dump(dummy_data, f, indent=2, ensure_ascii=False)
- return processed_data_path
-
-
@pytest.mark.unittest
@pytest.mark.runonci
def test_read_processed_data_positive(dummy_processed_data_path: Path):
@@ -235,3 +189,29 @@ def test_save_chunked_data(tmp_path: Path):
with output_file.open("r", encoding="utf-8") as f:
saved_data = json.load(f)
assert saved_data == chunked_data
+
+
+@pytest.mark.unittest
+@pytest.mark.runonci
+def test_chunk(tmp_path: Path, dummy_processed_data_path: Path):
+ """
+ Test the end-to-end `chunk()` run of the chunker module, from processed data to saved chunks.
+
+ Args:
+ tmp_path (Path): Temporary directory provided by pytest.
+ dummy_processed_data_path (Path): Path to the dummy processed data file.
+ """
+ chunker = StructuralChunker(
+ processed_data_path=str(dummy_processed_data_path),
+ output_path=str(tmp_path / "chunked_data.json"),
+ max_words=250,
+ )
+ chunker.chunk()
+ output_file = tmp_path / "chunked_data.json"  # same path as the output_path passed to the chunker above
+ assert output_file.exists()
+ with output_file.open("r", encoding="utf-8") as f:
+ saved_data = json.load(f)
+ assert (
+ saved_data[0]["note_id"]
+ == "_learning about me/what I learnt about myself when dealing with ADHD.md"
+ )