From c7fd51dc6dafc49616d0442fcb835bcfc4612fe8 Mon Sep 17 00:00:00 2001 From: Tom Kerby <46465501+tjkerby@users.noreply.github.com> Date: Wed, 1 Apr 2026 07:46:50 -0600 Subject: [PATCH 1/4] Refactor bibliography entries for consistency Updated author formatting and access dates in bibliography entries. --- paper/paper.bib | 191 ++++++++++++++++++++++++------------------------ 1 file changed, 97 insertions(+), 94 deletions(-) diff --git a/paper/paper.bib b/paper/paper.bib index 3426d82..16ff192 100644 --- a/paper/paper.bib +++ b/paper/paper.bib @@ -3,15 +3,13 @@ ## `paper.bib` -```bibtex @misc{anthropic2024mcp, title = {Introducing the Model Context Protocol}, - author = {Anthropic}, + author = {{Anthropic}}, year = {2024}, month = {November}, url = {https://www.anthropic.com/news/model-context-protocol}, - howpublished = {\url{https://www.anthropic.com/news/model-context-protocol}}, - note = {Accessed: August 22, 2025} + note = {Accessed: 2025-08-22} } @misc{mcp2024github, @@ -19,8 +17,7 @@ @misc{mcp2024github author = {{Model Context Protocol Contributors}}, year = {2024}, url = {https://github.com/modelcontextprotocol}, - howpublished = {\url{https://github.com/modelcontextprotocol}}, - note = {Accessed: August 22, 2025} + note = {Accessed: 2025-08-22} } @misc{semanticscholar2024api, @@ -28,17 +25,17 @@ @misc{semanticscholar2024api author = {{Semantic Scholar}}, year = {2024}, url = {https://www.semanticscholar.org/product/api}, - howpublished = {\url{https://www.semanticscholar.org/product/api}}, - note = {Accessed: August 22, 2025} + note = {Accessed: 2025-08-22} } @article{Kinney2023TheSS, - title={The Semantic Scholar Open Data Platform}, - author={Rodney Michael Kinney and Chloe Anastasiades and Russell Authur and Iz Beltagy and Jonathan Bragg and Alexandra Buraczynski and Isabel Cachola and Stefan Candra and Yoganand Chandrasekhar and Arman Cohan and Miles Crawford and Doug Downey and Jason Dunkelberger and Oren Etzioni and Rob Evans and Sergey Feldman and 
Joseph Gorney and David W. Graham and F.Q. Hu and Regan Huff and Daniel King and Sebastian Kohlmeier and Bailey Kuehl and Michael Langan and Daniel Lin and Haokun Liu and Kyle Lo and Jaron Lochner and Kelsey MacMillan and Tyler C. Murray and Christopher Newell and Smita R Rao and Shaurya Rohatgi and Paul Sayre and Zejiang Shen and Amanpreet Singh and Luca Soldaini and Shivashankar Subramanian and A. Tanaka and Alex D Wade and Linda M. Wagner and Lucy Lu Wang and Christopher Wilhelm and Caroline Wu and Jiangjiang Yang and Angele Zamarron and Madeleine van Zuylen and Daniel S. Weld}, - journal={ArXiv}, - year={2023}, - volume={abs/2301.10140}, - url={https://api.semanticscholar.org/CorpusID:256194545} + title = {The Semantic Scholar Open Data Platform}, + author = {Kinney, Rodney Michael and Anastasiades, Chloe and Authur, Russell and Beltagy, Iz and Bragg, Jonathan and Buraczynski, Alexandra and Cachola, Isabel and Candra, Stefan and Chandrasekhar, Yoganand and Cohan, Arman and Crawford, Miles and Downey, Doug and Dunkelberger, Jason and Etzioni, Oren and Evans, Rob and Feldman, Sergey and Gorney, Joseph and Graham, David W. and Hu, F. Q. and Huff, Regan and King, Daniel and Kohlmeier, Sebastian and Kuehl, Bailey and Langan, Michael and Lin, Daniel and Liu, Haokun and Lo, Kyle and Lochner, Jaron and MacMillan, Kelsey and Murray, Tyler C. and Newell, Christopher and Rao, Smita R. and Rohatgi, Shaurya and Sayre, Paul and Shen, Zejiang and Singh, Amanpreet and Soldaini, Luca and Subramanian, Shivashankar and Tanaka, A. and Wade, Alex D. and Wagner, Linda M. 
and Wang, Lucy Lu and Wilhelm, Christopher and Wu, Caroline and Yang, Jiangjiang and Zamarron, Angele and van Zuylen, Madeleine and Weld, Daniel S.}, + journal = {arXiv}, + year = {2023}, + volume = {2301.10140}, + doi = {10.48550/arXiv.2301.10140}, + url = {https://arxiv.org/abs/2301.10140} } @misc{neo4j2024database, @@ -46,8 +43,7 @@ @misc{neo4j2024database author = {{Neo4j Inc.}}, year = {2024}, url = {https://neo4j.com/product/neo4j-graph-database/}, - howpublished = {\url{https://neo4j.com/product/neo4j-graph-database/}}, - note = {Accessed: August 22, 2025} + note = {Accessed: 2025-08-22} } @article{franz2023cytoscape, @@ -147,23 +143,23 @@ @inproceedings{wang-etal-2024-leave } @article{gao2023retrieval, - title={Retrieval-augmented generation for large language models: A survey}, - author={Gao, Yunfan and Xiong, Yun and Gao, Xinyu and Jia, Kangxiang and Pan, Jinliu and Bi, Yuxi and Dai, Yi and Sun, Jiawei and Wang, Haofen and Wang, Haofen}, - journal={arXiv preprint arXiv:2312.10997}, - volume={2}, - year={2023} + title = {Retrieval-augmented generation for large language models: A survey}, + author = {Gao, Yunfan and Xiong, Yun and Gao, Xinyu and Jia, Kangxiang and Pan, Jinliu and Bi, Yuxi and Dai, Yi and Sun, Jiawei and Wang, Meng and Wang, Haofen}, + journal = {arXiv}, + volume = {2312.10997}, + year = {2023} } @inproceedings{reimers-2019-sentence-bert, - title = {Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks}, - author = {Reimers, Nils and Gurevych, Iryna}, + title = {Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks}, + author = {Reimers, Nils and Gurevych, Iryna}, booktitle = {Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing}, - pages = {3973--3983}, - month = {November}, - year = {2019}, + pages = {3973--3983}, + month = {November}, + year = {2019}, publisher = {Association for Computational Linguistics}, - url = {http://arxiv.org/abs/1908.10084}, - doi =
{10.18653/v1/d19-1410} + url = {http://arxiv.org/abs/1908.10084}, + doi = {10.18653/v1/D19-1410} } @article{reinanda2020knowledge, @@ -230,8 +226,7 @@ @misc{docker2024 author = {{Docker, Inc.}}, year = {2024}, url = {https://www.docker.com/}, - howpublished = {\url{https://www.docker.com/}}, - note = {Accessed: August 22, 2025} + note = {Accessed: 2025-08-22} } @misc{fastapi2024, @@ -239,8 +234,7 @@ @misc{fastapi2024 author = {{FastAPI Contributors}}, year = {2024}, url = {https://fastapi.tiangolo.com/}, - howpublished = {\url{https://fastapi.tiangolo.com/}}, - note = {Accessed: August 22, 2025} + note = {Accessed: 2025-08-22} } @misc{rabbitmq2024, @@ -248,8 +242,7 @@ @misc{rabbitmq2024 author = {{VMware, Inc.}}, year = {2024}, url = {https://www.rabbitmq.com/}, - howpublished = {\url{https://www.rabbitmq.com/}}, - note = {Accessed: August 22, 2025} + note = {Accessed: 2025-08-22} } @misc{ollama2024, @@ -257,25 +250,25 @@ @misc{ollama2024 author = {{Ollama Team}}, year = {2024}, url = {https://ollama.com/}, - howpublished = {\url{https://ollama.com/}}, - note = {Accessed: August 22, 2025} + note = {Accessed: 2025-08-22} } @misc{openai2023api, - title={OpenAI API}, - author={OpenAI}, - note={\url{https://platform.openai.com/}}, - year={2023} + title = {OpenAI API Documentation}, + author = {{OpenAI}}, + year = {2026}, + url = {https://developers.openai.com/api/docs}, + note = {Accessed: 2026-04-01} } @article{nussbaum2025nomic, - title={Nomic Embed: Training a Reproducible Long Context Text Embedder}, - author={Nussbaum, Zach and Morris, John Xavier and Mulyar, Andriy and Duderstadt, Brandon}, - journal={Transactions on Machine Learning Research}, - issn={2835-8856}, - year={2025}, - url={https://openreview.net/forum?id=IPmzyQSiQE}, - note={Reproducibility Certification} + title = {Nomic Embed: Training a Reproducible Long Context Text Embedder}, + author = {Nussbaum, Zach and Morris, John Xavier and Mulyar, Andriy and Duderstadt, Brandon}, + journal = {Transactions 
on Machine Learning Research}, + issn = {2835-8856}, + year = {2025}, + url = {https://openreview.net/forum?id=IPmzyQSiQE}, + note = {Reproducibility Certification} } @article{rothacher2023eleven, @@ -291,81 +284,91 @@ @article{rothacher2023eleven } @article{merkel2014docker, - title={Docker: lightweight linux containers for consistent development and deployment}, - author={Merkel, Dirk}, - journal={Linux journal}, - volume={2014}, - number={239}, - pages={2}, - year={2014} + title = {Docker: lightweight {Linux} containers for consistent development and deployment}, + author = {Merkel, Dirk}, + journal = {Linux Journal}, + volume = {2014}, + number = {239}, + pages = {2}, + year = {2014} } @misc{langchain2026rag, - title = {Build a RAG agent with LangChain}, - author = {{LangChain}}, - howpublished = {\url{https://docs.langchain.com/oss/python/langchain/rag}}, - note = {Accessed: 2026-01-23} + title = {Build a RAG agent with LangChain}, + author = {{LangChain}}, + year = {2026}, + url = {https://docs.langchain.com/oss/python/langchain/rag}, + note = {Accessed: 2026-01-23} } @misc{llamaindex2026rag, - title = {Introduction to RAG (Retrieval-Augmented Generation)}, - author = {{LlamaIndex}}, - howpublished = {\url{https://developers.llamaindex.ai/python/framework/understanding/rag/}}, - note = {Accessed: 2026-01-23} + title = {Introduction to RAG (Retrieval-Augmented Generation)}, + author = {{LlamaIndex}}, + year = {2026}, + url = {https://developers.llamaindex.ai/python/framework/understanding/rag/}, + note = {Accessed: 2026-01-23} } @misc{haystack2026, - title = {Get Started (Haystack Documentation)}, - author = {{deepset Haystack}}, - howpublished = {\url{https://docs.haystack.deepset.ai/docs/get-started}}, - note = {Accessed: 2026-01-23} + title = {Get Started (Haystack Documentation)}, + author = {{deepset Haystack}}, + year = {2026}, + url = {https://docs.haystack.deepset.ai/docs/get-started}, + note = {Accessed: 2026-01-23} } @misc{llamaindex2026kg, - title =
{Knowledge Graph Index (LlamaIndex Python Documentation)}, - author = {{LlamaIndex}}, - howpublished = {\url{https://developers.llamaindex.ai/python/examples/index_structs/knowledge_graph/knowledgegraphdemo/}}, - note = {Accessed: 2026-01-23} + title = {Knowledge Graph Index (LlamaIndex Python Documentation)}, + author = {{LlamaIndex}}, + year = {2026}, + url = {https://developers.llamaindex.ai/python/examples/index_structs/knowledge_graph/knowledgegraphdemo/}, + note = {Accessed: 2026-01-23} } @misc{langchain2026neo4j, - title = {Neo4j (LangChain Integration Documentation)}, - author = {{LangChain}}, - howpublished = {\url{https://docs.langchain.com/oss/python/integrations/providers/neo4j}}, - note = {Accessed: 2026-01-23} + title = {Neo4j (LangChain Integration Documentation)}, + author = {{LangChain}}, + year = {2026}, + url = {https://docs.langchain.com/oss/python/integrations/providers/neo4j}, + note = {Accessed: 2026-01-23} } @misc{microsoft2026graphrag, - title = {microsoft/graphrag: A modular graph-based Retrieval-Augmented Generation (RAG) system}, - author = {{Microsoft}}, - howpublished = {\url{https://github.com/microsoft/graphrag}}, - note = {Accessed: 2026-01-23} + title = {microsoft/graphrag: A modular graph-based Retrieval-Augmented Generation (RAG) system}, + author = {{Microsoft}}, + year = {2026}, + url = {https://github.com/microsoft/graphrag}, + note = {Accessed: 2026-01-23} } @misc{neo4j2026graphrag, - title = {neo4j/neo4j-graphrag-python: Neo4j GraphRAG Package for Python}, - author = {{Neo4j}}, - howpublished = {\url{https://github.com/neo4j/neo4j-graphrag-python}}, - note = {Accessed: 2026-01-23} + title = {neo4j/neo4j-graphrag-python: Neo4j GraphRAG Package for Python}, + author = {{Neo4j}}, + year = {2026}, + url = {https://github.com/neo4j/neo4j-graphrag-python}, + note = {Accessed: 2026-01-23} } @misc{connectedpapers2026, - title = {Connected Papers}, - author = {{Connected Papers}}, - howpublished = 
{\url{https://www.connectedpapers.com/}}, - note = {Accessed: 2026-01-23} + title = {Connected Papers}, + author = {{Connected Papers}}, + year = {2026}, + url = {https://www.connectedpapers.com/}, + note = {Accessed: 2026-01-23} } @misc{researchrabbit2026, - title = {ResearchRabbit}, - author = {{ResearchRabbit}}, - howpublished = {\url{https://www.researchrabbit.ai/}}, - note = {Accessed: 2026-01-23} + title = {ResearchRabbit}, + author = {{ResearchRabbit}}, + year = {2026}, + url = {https://www.researchrabbit.ai/}, + note = {Accessed: 2026-01-23} } @misc{litmaps2026, - title = {Litmaps}, - author = {{Litmaps}}, - howpublished = {\url{https://www.litmaps.com/}}, - note = {Accessed: 2026-01-23} + title = {Litmaps}, + author = {{Litmaps}}, + year = {2026}, + url = {https://www.litmaps.com/}, + note = {Accessed: 2026-01-23} } From dab4a7a1c8b8c5e42d41e0765024ed286e5f1e6b Mon Sep 17 00:00:00 2001 From: Ben Fuller <144738811+ben-n-fuller@users.noreply.github.com> Date: Wed, 1 Apr 2026 09:51:59 -0400 Subject: [PATCH 2/4] Revise software design principles section in paper --- paper/paper.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/paper/paper.md b/paper/paper.md index d273665..da71523 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -47,12 +47,12 @@ Literature discovery platforms such as Connected Papers, ResearchRabbit, and Lit # Software Design Nexarag was designed around four principles: ease of use, flexibility, modularity, and privacy/security. -| Principle | Design choices in Nexarag | Tradeoffs | -| - | - | - | -| **Ease of use** | Nexarag uses familiar frontend technologies, including Angular, D3.js, Cytoscape.js, and PrimeNg, to provide an intuitive interface for building knowledge graphs, finding papers, interacting with LLMs, and visualizing results. Nexarag is fully containerized and can be deployed with a single command. 
| Multiple component libraries increases frontend maintenance overhead and onboarding cost for new contributors. Container deployments add complexity and additional software dependencies (Docker). | -| **Flexibility** | Nexarag integrates with Ollama and supports any embedding model or LLM that the user’s hardware can run, making it easy to switch models across tasks or adopt new ones as they become available. Users can also connect their preferred LLM or coding agent through the built-in MCP server. | Users have more choices but also more responsibility for hardware configuration, model selection, and staying up-to-date on relevant tools and architectures. | -| **Modularity** | The system is organized as distinct services for the REST API, Neo4j knowledge graph, MCP server, and frontend application, connected through a RabbitMQ messaging backbone. This supports horizontal scaling and reduces the blast radius of changes made within any single service. | Service decomposition improves scalability and isolates changes, but increases deployment complexity, inter-service coordination, and operational overhead. | -| **Privacy and security** | Nexarag supports on-premises, air-gapped deployment, providing a level of privacy and security that cloud-based applications typically cannot offer. | Air-gapped deployments can offer heightened security and privacy, but place more burden on the user for hardware configuration, deployment, and maintenance. Local compute resources may also be limited compared to cloud services. | +| Design choices in Nexarag | Tradeoffs | +| - | - | +| **Ease of use**: Nexarag uses familiar frontend technologies, including Angular, D3.js, Cytoscape.js, and PrimeNG, to provide an intuitive interface for building knowledge graphs, finding papers, interacting with LLMs, and visualizing results. Nexarag is fully containerized and can be deployed with a single command.
| Multiple component libraries increase frontend maintenance overhead and onboarding cost for new contributors. Container deployments add complexity and additional software dependencies (Docker). | +| **Flexibility**: Nexarag integrates with Ollama and supports any embedding model or LLM that the user’s hardware can run, making it easy to switch models across tasks or adopt new ones as they become available. Users can also connect their preferred LLM or coding agent through the built-in MCP server. | Users have more choices but also more responsibility for hardware configuration, model selection, and staying up-to-date on relevant tools and architectures. | +| **Modularity**: The system is organized as distinct services for the REST API, Neo4j knowledge graph, MCP server, and frontend application, connected through a RabbitMQ messaging backbone. This supports horizontal scaling and reduces the blast radius of changes made within any single service. | Service decomposition improves scalability and isolates changes, but increases deployment complexity, inter-service coordination, and operational overhead. | +| **Privacy and security**: Nexarag supports on-premises, air-gapped deployment, providing a level of privacy and security that cloud-based applications typically cannot offer. | Air-gapped deployments can offer heightened security and privacy, but place more burden on the user for hardware configuration, deployment, and maintenance. Local compute resources may also be limited compared to cloud services. | # Software overview From 6c1a70e29a69eee665bb7d8f4509a68c0a1b5cc7 Mon Sep 17 00:00:00 2001 From: Ben Fuller <144738811+ben-n-fuller@users.noreply.github.com> Date: Wed, 8 Apr 2026 07:57:44 -0600 Subject: [PATCH 3/4] Modify author affiliations and section titles Updated author affiliations and section headers in the paper.
--- paper/paper.md | 73 ++++++++++++++------------------------------------ 1 file changed, 20 insertions(+), 53 deletions(-) diff --git a/paper/paper.md b/paper/paper.md index da71523..2ecc594 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -10,15 +10,15 @@ authors: - name: Thomas J. Kerby affiliation: 1 - name: Benjamin N. Fuller - affiliation: 3 - - name: Kevin R. Moon affiliation: 2 + - name: Kevin R. Moon + affiliation: 3 affiliations: - name: Brigham Young University, Provo, UT index: 1 - - name: Utah State University, Logan, UT - index: 2 - name: Independent Researcher + index: 2 + - name: Utah State University, Logan, UT index: 3 date: 2025-11-04 bibliography: paper.bib @@ -29,13 +29,13 @@ bibliography: paper.bib Large language models (LLMs) are widely used in research workflows but struggle with hallucinations, short context windows, and weak reproducibility in literature reviews [@Ji2023; @Huang2025]. Nexarag is a modular, open‑source platform that lets researchers curate, visualize, and share custom knowledge graphs (KGs) from academic sources stored in Neo4j [@neo4j2024database]. Through native support for the Model Context Protocol (MCP), any MCP‑compatible LLM can access these curated KGs for controllable, reproducible context injection [@anthropic2024mcp; @mcp2024github]—including fully private, air‑gapped deployments via containers [@boettiger2015docker]—so teams can explore literature more deeply and transparently. Nexarag provides interactive graph/semantic visualizations using Cytoscape.js and D3 [@franz2023cytoscape; @bostock2011d3]. 
-## Statement of need +# Statement of need Retrieval-augmented generation (RAG) has become a standard approach for knowledge-intensive NLP [@Lewis2020; @Guu2020; @gao2023retrieval], but systems built primarily on embedding-based similarity [@Lewis2020; @Guu2020; @reimers-2019-sentence-bert] can miss long-range semantic structure and cross-document relationships, especially in long-context and multi-document settings [@wang-etal-2024-leave; @gao2023retrieval]. For literature synthesis and related research workflows, this limits transparency, controllability, and reproducibility. Knowledge graphs address part of this problem by representing entities and relations explicitly, enabling path-based queries and more interpretable reasoning over document collections [@reinanda2020knowledge; @sahlab2022knowledge; @Xu2024]. However, existing KG-based tooling is often either proprietary or too technically demanding for routine research use. Nexarag addresses this gap with a researcher-friendly, self-hostable platform for constructing and curating literature knowledge graphs and exposing them to language models through MCP [@anthropic2024mcp]. -## State of the field +# State of the field Open-source RAG frameworks such as LangChain, LlamaIndex, and Haystack provide reusable components for ingestion, indexing, and retrieve-then-generate pipelines [@langchain2026rag; @llamaindex2026rag; @haystack2026]. They are effective developer libraries, but they are mainly designed for assembling application-specific pipelines in code. @@ -44,8 +44,9 @@ Graph-augmented retrieval tools extend this ecosystem. LlamaIndex includes a Kno Literature discovery platforms such as Connected Papers, ResearchRabbit, and Litmaps support citation exploration and related-work discovery [@connectedpapers2026; @researchrabbit2026; @litmaps2026], but they do not provide a researcher-owned, versionable graph substrate for controlled LLM experiments. 
Nexarag sits between these tool families by packaging persistent Neo4j-based graph construction, interactive curation, and standardized model access through MCP into a self-hostable research application [@anthropic2024mcp; @mcp2024github]. It complements existing RAG and GraphRAG libraries by turning graph-based context construction into a reusable and shareable research workflow with user-friendly UI tools, visualization, and pluggable MCP integration. -# Software Design -Nexarag was designed around four principles: ease of use, flexibility, modularity, and privacy/security. +# Software design +Nexarag was designed around four principles: ease of use, flexibility, modularity, and privacy/security. The system uses a containerized, microservices design orchestrated with Docker Compose [@merkel2014docker; @docker2024]. Primary services include: a FastAPI service for HTTP coordination [@fastapi2024], a Neo4j database for graph storage [@neo4j2024database], and a Knowledge Graph service for document processing/embeddings/AI tasks. Services communicate asynchronously via RabbitMQ, enabling horizontal scaling [@rabbitmq2024]. + | Design choices in Nexarag | Tradeoffs | | - | - | @@ -55,58 +56,24 @@ Nexarag was designed around four principles: ease of use, flexibility, modularit | **Privacy and security**: Nexarag supports on-premises, air-gapped deployment, providing a level of privacy and security that cloud-based applications typically cannot offer. | Air-gapped deployments can offer heightened security and privacy, but place more burden on the user for hardware configuration, deployment, and maintenance. Local compute resources may also be limited compared to cloud services. 
| -# Software overview - -**Core capabilities.** Nexarag provides: (i) automated KG construction from BibTeX, paper lists, search queries, and citation expansion (Semantic Scholar integration) [@Kinney2023TheSS; @semanticscholar2024api]; (ii) Neo4j‑backed storage and Cypher querying [@neo4j2024database]; (iii) interactive graph and semantic visualizations (Cytoscape.js and D3.js) [@franz2023cytoscape; @bostock2011d3]; and (iv) an AI “Talk To Your Data” interface that supports both simple retrieve‑and‑generate and ReAct‑style agentic workflows [@yao2022react]. - -**Architecture.** The system uses a containerized, microservices design orchestrated with Docker Compose [@merkel2014docker; @docker2024]. Primary services include: a FastAPI service for HTTP coordination [@fastapi2024], a Neo4j database for graph storage [@neo4j2024database], and a Knowledge Graph service for document processing/embeddings/AI tasks. Services communicate asynchronously via RabbitMQ, enabling horizontal scaling [@rabbitmq2024]. - -**MCP integration.** Nexarag ships an MCP‑compatible server that exposes graph querying, semantic search over embedded content, and external search via Semantic Scholar to any MCP‑enabled LLM (local via Ollama or remote via hosted providers) [@anthropic2024mcp; @mcp2024github; @ollama2024; @openai2023api]. This standardizes context delivery and promotes reproducible prompt‑driven research workflows. 
- -**Install & minimal run.** (see repository docs for full instructions) - -```bash -# CPU example -docker compose -f docker-compose.cpu.yml up -d -# or on macOS -docker compose -f docker-compose.macos.yml up -d -``` - -Optionally pull local models for embedding/LLM integration with Ollama [@ollama2024]; for example, a long‑context text embedder like Nomic Embed [@nussbaum2025nomic]: - -```bash -# inside the Ollama container or on macOS host -ollama pull nomic-embed-text:v1.5 -ollama pull gemma3:1b -``` - -**Repository:** [https://github.com/KevinMoonLab/Nexarag](https://github.com/KevinMoonLab/Nexarag) - -**License:** GNU General Public License v3.0. - -## Use cases - -- **Reproducible literature reviews.** Build a KG from a seed set (e.g., via BibTeX), expand by citations, and generate a structured review through the MCP interface [@sahlab2022knowledge]. -- **Private research contexts.** Run entirely offline (air‑gapped) with local LLMs for sensitive domains (e.g., healthcare, legal, proprietary research) [@boettiger2015docker]. -- **Collaborative curation.** Share/export/import graphs across teams to support longitudinal projects. - -## Quality control - -Nexarag emphasizes verifiable operation through containerized deployment and a guided quick start [@boettiger2015docker]. Reviewers can launch the full stack with Docker Compose, query/persist KGs in Neo4j, and exercise end‑to‑end flows (semantic search, citation expansion, MCP tools). A worked MCP chat transcript and an automatically generated literature review illustrate that the system’s graph building, retrieval, and reporting features execute as described. The repository includes example datasets/notebooks and scripts for running tests where applicable, supporting broader reproducibility goals in research practice [@rothacher2023eleven]. 
+Nexarag brings these design choices together in a research platform that offers: +* Automated KG construction from BibTeX, paper lists, search queries, and citation expansion (Semantic Scholar integration) [@Kinney2023TheSS; @semanticscholar2024api]. +* Interactive graph and semantic visualizations (Cytoscape.js and D3.js) [@franz2023cytoscape; @bostock2011d3]. +* An AI “Talk To Your Data” interface that supports both simple retrieve‑and‑generate and ReAct‑style agentic workflows [@yao2022react]. +* An MCP‑compatible server that exposes graph querying, semantic search over embedded content, and external search via Semantic Scholar to any MCP‑enabled LLM (local via Ollama or remote via hosted providers) [@anthropic2024mcp; @mcp2024github; @ollama2024; @openai2023api]. +* Neo4j‑backed storage and Cypher querying [@neo4j2024database]. +* Tools to share/export/import graphs across teams to support longitudinal projects. +* Entirely offline (air‑gapped) deployment with local LLMs for sensitive domains (e.g., healthcare, legal, proprietary research) [@boettiger2015docker]. # Research impact statement -Nexarag addresses a growing need in LLM-assisted research for transparent, reproducible, and inspectable context construction beyond embedding-only retrieval. While many RAG systems remain opaque and difficult to reproduce, Nexarag operationalizes knowledge-graph–based context building in a form that researchers can deploy locally, inspect visually, and share across projects. By combining Neo4j-backed knowledge graphs with standardized access through the Model Context Protocol (MCP), the software provides a reproducible bridge between structured scholarly knowledge and LLM-driven analysis. - -Although Nexarag is a relatively new project and has not yet accumulated extensive downstream citations, it demonstrates credible near-term research impact through its design, documentation, and reproducible reference materials. 
The repository includes end-to-end examples that reproduce literature expansion, graph construction, semantic querying, and LLM-mediated synthesis from fixed inputs, allowing independent researchers to verify behavior and compare results across models and deployment environments. Containerized deployment and air-gapped operation further support use in domains where reproducibility, auditability, or data sensitivity are critical. - -Nexarag is positioned to serve as shared research infrastructure for studies on retrieval-augmented generation, knowledge-graph–augmented reasoning, and AI-assisted literature review workflows. Its model-agnostic design, enabled by MCP, allows researchers to interchange local or API-hosted LLMs while holding the underlying knowledge graph and retrieval logic fixed. This supports a direct comparison of LLM behavior under identical, graph-derived contexts, facilitating methodological research on controllability, hallucination reduction, and long-context reasoning. By lowering the technical barrier to building, inspecting, and sharing reproducible knowledge graph contexts, Nexarag enables researchers to move beyond ad hoc, model-coupled RAG pipelines toward more transparent and portable AI-assisted research practices. +Nexarag addresses a growing need in LLM-assisted research for transparent, reproducible, and inspectable context construction beyond embedding-only retrieval by operationalizing knowledge-graph–based context building in a locally deployable, visually inspectable form that can be shared across projects while lowering the technical barrier to constructing, inspecting, and reusing reproducible graph-based contexts. 
By combining Neo4j-backed knowledge graphs with standardized model access through the Model Context Protocol (MCP), it creates a reproducible bridge between structured scholarly knowledge and LLM-driven analysis while remaining model-agnostic, allowing researchers to interchange local or API-hosted LLMs without changing the underlying knowledge graph or retrieval logic. This makes Nexarag well suited as shared research infrastructure for retrieval-augmented generation, knowledge-graph–augmented reasoning, and AI-assisted literature review, enabling direct comparison of model behavior under identical graph-derived contexts and facilitating methodological work on controllability, hallucination reduction, and long-context reasoning. # AI usage disclosure Generative AI tools were used in the development of the software, supporting code reviews, providing minor features in the frontend, and identifying and fixing bugs. Generative AI tools were also used to generate some of the documentation, and assisted with paper authoring. We primarily used: -* ChatGPT with the GPT-4o model for writing tasks -* Claude Code with the Sonnet 4 model for coding tasks +* ChatGPT with the GPT-4o model for writing tasks. +* Claude Code with the Sonnet 4 model for coding tasks. All AI-generated material was explicitly reviewed by at least one author, and all major design decisions were formalized by multiple authors. From 6356f13194c4a879525eba832ee98b5623c01f4e Mon Sep 17 00:00:00 2001 From: Ben Fuller <144738811+ben-n-fuller@users.noreply.github.com> Date: Wed, 8 Apr 2026 08:03:55 -0600 Subject: [PATCH 4/4] Document Nexarag features and design choices Added features and capabilities of Nexarag research platform. 
--- paper/paper.md | 1 + 1 file changed, 1 insertion(+) diff --git a/paper/paper.md b/paper/paper.md index 2ecc594..717e570 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -57,6 +57,7 @@ Nexarag was designed around four principles: ease of use, flexibility, modularit Nexarag brings these design choices together in a research platform that offers: + * Automated KG construction from BibTeX, paper lists, search queries, and citation expansion (Semantic Scholar integration) [@Kinney2023TheSS; @semanticscholar2024api]. * Interactive graph and semantic visualizations (Cytoscape.js and D3.js) [@franz2023cytoscape; @bostock2011d3]. * An AI “Talk To Your Data” interface that supports both simple retrieve‑and‑generate and ReAct‑style agentic workflows [@yao2022react].