GerevAI
diff --git a/‎.dockerignore‎
Lines changed: 3 additions & 1 deletion b/‎.dockerignore‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎.github/workflows/deploy.yaml‎
Lines changed: 9 additions & 0 deletions b/‎.github/workflows/deploy.yaml‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎README.md‎
Lines changed: 20 additions & 13 deletions b/‎README.md‎
Lines changed: 20 additions & 13 deletions
diff --git a/‎app/data_source_api/utils.py‎
Lines changed: 20 additions & 0 deletions b/‎app/data_source_api/utils.py‎
Lines changed: 20 additions & 0 deletions
diff --git a/‎app/data_sources/bookstack.py‎
Lines changed: 215 additions & 0 deletions b/‎app/data_sources/bookstack.py‎
Lines changed: 215 additions & 0 deletions
diff --git a/‎app/data_sources/confluence.py‎
Lines changed: 24 additions & 7 deletions b/‎app/data_sources/confluence.py‎
Lines changed: 24 additions & 7 deletions
@@ -1,3 +1,5 @@
 ui/node_modules
 app/venv
-storage
+storage
+app/.env
+.env
@@ -24,6 +24,15 @@ jobs:
  - name: Checkout code
  uses: actions/checkout@v2
 
+ - name: 🐷 TruffleHog OSS
+ uses: trufflesecurity/trufflehog@v3.29.1
+ with:
+ path: ./
+ base: ${{ github.event.repository.default_branch }}
+ head: HEAD
+ extra_args: --debug --only-verified
+
+
  # Use Caching for npm
  - name: Cache node modules
  uses: actions/cache@v2
 
@@ -1,14 +1,18 @@
-![first image](./images/api.gif)
-**Find any conversation, doc, or internal page in seconds** ⏲️⚡️ 
-**Join 100+** devs by hosting your own gerev instance, become a **hero** within your org! 💪
-
 ## Join Discord for early access code!
 
 ![Discord Shield](https://discordapp.com/api/guilds/1060085859497549844/widget.png?style=shield) 
 
- [Join here!](https://discord.gg/aMRRcmhAdW)
-# Search focused on devs
-Devs are the best early adopters, they adopt technology early and aid in spreading it to their non-technical peers. That's why gerev is focused on making a product dev's adore and love ❤️
+ [Join here!](https://discord.gg/NKhTX7JZAF)
+ 
+# **New!** 🎉 ChatGPT plugin support!
+**Connect Gerev as a plugin to ChatGPT -> ask questions about your entire organization.** 
+See more awesome plugins [awesome-chatgpt-plugins](https://github.com/GerevAI/awesome-chatgpt-plugins)
+![ChatGPT Integration](./images/integ.jpeg)
+
+# Search engine for your organization!
+![first image](./images/api.gif)
+**Find any conversation, doc, or internal page in seconds** ⏲️⚡️ 
+**Join 100+** devs by hosting your own gerev instance, become a **hero** within your org! 💪
 
 ## Made for devs 👨‍💻
 - **For finding internal pages _fast_ ⚡️**
@@ -25,17 +29,20 @@ Coming Soon...
  - [x] Confluence
  - [x] Google Drive (Docs, .docx, .pptx)
  - [X] Confluence Cloud - by [@bryan-pakulski](https://github.com/bryan-pakulski) :pray: 
- - [ ] Bookstack
- - [ ] RocketChat (in PR)
- - [ ] Gitlab Issues (In PR)
- - [ ] Notion (In Progress...)
+ - [X] Bookstack - by [@flifloo](https://github.com/flifloo) :pray:
+ - [X] Mattermost - by [@itaykal](https://github.com/Itaykal) :pray:
+ - [X] RocketChat - by [@flifloo](https://github.com/flifloo) :pray:
+ - [ ] Gitlab Issues (In PR :pray:)
+ - [ ] Zendesk (In PR :pray:)
+ - [ ] Notion (In Progress... :pray:)
  - [ ] Microsoft Teams
  - [ ] Sharepoint
+ - [ ] Jira
 
 :pray: - by the community 
- 
 
-## Natural Langauge
+
+## Natural Language
 Enables searching using natural language, such as `"How to do X"`, `"how to connect to Y"`, `"Do we support Z"`
 
 --- 
 
@@ -1,6 +1,12 @@
+import base64
 import importlib
 import logging
 import concurrent.futures
+from functools import lru_cache
+from io import BytesIO
+from typing import Optional
+
+import requests
 
 logger = logging.getLogger(__name__)
 
@@ -37,3 +43,17 @@ def parse_with_workers(method_name: callable, items: list, **kwargs):
  e = w.exception()
  if e:
  logging.exception("Worker failed", exc_info=e)
+
+
@lru_cache(maxsize=512)
def get_confluence_user_image(image_url: str, token: str) -> Optional[str]:
    """Download a Confluence avatar and return it as a base64 ``data:`` URI.

    Args:
        image_url: URL of the avatar image. The ``anonymous.svg`` placeholder
            is requested as ``.png`` instead.
        token: Bearer token sent in the ``Authorization`` header.

    Returns:
        A ``data:image/jpeg;base64,...`` string, or ``None`` when the image
        could not be fetched. Failures are logged as warnings, never raised.

    Note:
        Results are memoised per ``(image_url, token)`` pair by ``lru_cache``;
        a ``None`` result is memoised as well.
    """
    try:
        if "anonymous.svg" in image_url:
            image_url = image_url.replace(".svg", ".png")

        response = requests.get(url=image_url, timeout=1,
                                headers={'Accept': 'application/json',
                                         "Authorization": f"Bearer {token}"})
        # Without this check the body of a 401/404 page would be encoded and
        # handed back as if it were the avatar.
        response.raise_for_status()
        encoded = base64.b64encode(response.content).decode()
        return f"data:image/jpeg;base64,{encoded}"
    except Exception:
        # Best effort: a missing avatar must not break indexing. Only
        # ``Exception`` is caught so Ctrl-C / SystemExit still propagate.
        logger.warning(f"Failed to get confluence user image {image_url}")
        return None
@@ -0,0 +1,215 @@
+import logging
+from datetime import datetime
+from typing import List, Dict
+
+from data_source_api.basic_document import BasicDocument, DocumentType
+from data_source_api.base_data_source import BaseDataSource, ConfigField, HTMLInputType
+from data_source_api.exception import InvalidDataSourceConfig
+from data_source_api.utils import parse_with_workers
+from index_queue import IndexQueue
+from parsers.html import html_to_text
+from pydantic import BaseModel
+from requests import Session, HTTPError
+from requests.auth import AuthBase
+from urllib.parse import urljoin
+from time import sleep
+
+
+logger = logging.getLogger(__name__)
+
+
class BookStackAuth(AuthBase):
    """``requests`` auth hook that adds a BookStack API token header.

    The header value has the form ``Token <token_id>:<token_secret>`` and is
    written under ``header_key`` (``Authorization`` unless overridden).
    """

    def __init__(self, token_id, token_secret, header_key="Authorization"):
        self.token_id, self.token_secret = token_id, token_secret
        self.header_key = header_key

    def __call__(self, r):
        """Set the token header on the outgoing request ``r`` and return it."""
        header_value = f"Token {self.token_id}:{self.token_secret}"
        r.headers[self.header_key] = header_value
        return r
+
+
class BookStack(Session):
    """Small BookStack REST client built on ``requests.Session``.

    Every request path is resolved against ``base_url``, authenticated with
    the API token pair, and retried when the server answers ``429``.
    """

    def __init__(self, url: str, token_id: str, token_secret: str, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.base_url = url
        self.auth = BookStackAuth(token_id, token_secret)
        # True while a caller is sitting out a rate-limit pause; other callers
        # poll this flag before sending.
        self.rate_limit_reach = False

    def request(self, method, url_path, *args, **kwargs):
        """Send a request to ``url_path`` (joined onto ``base_url``).

        A ``429`` response makes the call wait 60 seconds and try again; any
        other non-200 status goes through ``raise_for_status()``.

        Returns:
            The ``requests`` response object.

        Raises:
            requests.HTTPError: for 4xx/5xx responses other than 429.
        """
        # NOTE(review): TLS certificate verification stays off by default, as
        # before. ``setdefault`` (instead of a hard-coded keyword next to
        # ``**kwargs``) avoids "multiple values for keyword argument 'verify'"
        # when a caller, or a retry, already supplies it.
        kwargs.setdefault("verify", False)
        url = urljoin(self.base_url, url_path)

        while True:
            # Hold back while another caller is waiting out the rate limit
            while self.rate_limit_reach:
                sleep(1)

            r = super().request(method, url, *args, **kwargs)

            if r.status_code == 429:
                if not self.rate_limit_reach:
                    logger.info("API rate limit reach, waiting...")
                    self.rate_limit_reach = True
                    try:
                        sleep(60)
                    finally:
                        # Never leave the flag stuck if the sleep is interrupted
                        self.rate_limit_reach = False
                    logger.info("Done waiting for the API rate limit")
                continue

            if r.status_code != 200:
                r.raise_for_status()
            return r

    def get_list(self, url: str, count: int = 500, sort: str = None, filters: Dict = None):
        """Fetch every record of a paginated listing endpoint.

        Args:
            url: Endpoint path, e.g. ``/api/books``.
            count: Page size sent as the ``count`` query parameter.
            sort: Value for the ``sort`` query parameter, if any.
            filters: Mapping sent as ``filter[<key>]=<value>`` parameters.

        Returns:
            The concatenated ``data`` entries of all pages.
        """
        # Wrap keys as filter[...] so no unwanted plain parameters are sent
        filter_params = {f"filter[{k}]": v for k, v in (filters or {}).items()}

        data = []
        total = 1  # Set 1 to enter the loop
        while len(data) < total:
            r = self.get(url, params={"count": count, "offset": len(data), "sort": sort, **filter_params},
                         headers={"Content-Type": "application/json"})
            payload = r.json()
            page = payload.get("data") or []
            if not page:
                # An empty page can never advance the offset; stop rather
                # than request the same page forever.
                break
            data += page
            total = payload.get("total") or 0
        return data

    def get_all_books(self) -> List[Dict]:
        """Return all books, sorted by ``+updated_at``."""
        return self.get_list("/api/books", sort="+updated_at")

    def get_all_pages_from_book(self, book) -> List[Dict]:
        """Return all pages of ``book``, each with the book stored under ``"book"``."""
        pages = self.get_list("/api/pages", sort="+updated_at", filters={"book_id": book["id"]})

        # Add parent book object to each page
        for page in pages:
            page.update({"book": book})

        return pages

    def get_page(self, page_id: int):
        """Return the decoded JSON body of ``/api/pages/<page_id>``."""
        r = self.get(f"/api/pages/{page_id}", headers={"Content-Type": "application/json"})
        return r.json()

    def get_user(self, user_id: int):
        """Return the decoded JSON body of ``/api/users/<user_id>``, or ``None`` on an HTTP error."""
        try:
            return self.get(f"/api/users/{user_id}", headers={"Content-Type": "application/json"}).json()
        # If the user lacks the privileges to make this call, return None
        except HTTPError:
            return None
+
+
class BookStackConfig(BaseModel):
    """Connection settings for a BookStack instance."""

    # Base URL of the BookStack instance
    url: str
    # API token pair used to build the "Token <id>:<secret>" header
    token_id: str
    token_secret: str
+
+
class BookstackDataSource(BaseDataSource):
    """Data source that feeds the pages of every BookStack book to the index queue."""

    @staticmethod
    def get_config_fields() -> List[ConfigField]:
        """Return the form fields needed to configure this data source."""
        return [
            ConfigField(label="BookStack instance URL", name="url"),
            ConfigField(label="Token ID", name="token_id", input_type=HTMLInputType.PASSWORD),
            ConfigField(label="Token Secret", name="token_secret", input_type=HTMLInputType.PASSWORD)
        ]

    @classmethod
    def get_display_name(cls) -> str:
        """Return the human-readable name of this data source."""
        return "BookStack"

    @staticmethod
    def list_books(book_stack: BookStack) -> List[Dict]:
        """List all books, retrying up to three times.

        Raises:
            Exception: whatever the last failed attempt raised.
        """
        # The BookStack connection can fail, so retry a few times
        retries = 3
        for attempt in range(retries):
            try:
                return book_stack.get_all_books()
            except Exception as e:
                logger.error(f"BookStack connection failed: {e}")
                if attempt == retries - 1:
                    raise

    @staticmethod
    def validate_config(config: Dict) -> None:
        """Check ``config`` by parsing it and listing books with it.

        Raises:
            InvalidDataSourceConfig: if parsing or the listing call fails.
        """
        try:
            parsed_config = BookStackConfig(**config)
            book_stack = BookStack(url=parsed_config.url, token_id=parsed_config.token_id,
                                   token_secret=parsed_config.token_secret)
            BookstackDataSource.list_books(book_stack=book_stack)
        except Exception as e:
            raise InvalidDataSourceConfig from e

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        book_stack_config = BookStackConfig(**self._config)
        self._book_stack = BookStack(url=book_stack_config.url, token_id=book_stack_config.token_id,
                                     token_secret=book_stack_config.token_secret)

    def _list_books(self) -> List[Dict]:
        """List all books through this instance's client."""
        logger.info("Listing books with BookStack")
        return BookstackDataSource.list_books(book_stack=self._book_stack)

    def _feed_new_documents(self) -> None:
        """Collect the pages of every book and hand them to the parsing workers."""
        logger.info("Feeding new documents with BookStack")

        books = self._list_books()
        raw_docs = []
        for book in books:
            raw_docs.extend(self._list_book_pages(book))

        parse_with_workers(self._parse_documents_worker, raw_docs)

    def _parse_documents_worker(self, raw_docs: List[Dict]):
        """Turn raw page records into ``BasicDocument`` objects and queue them.

        Pages whose ``updated_at`` is older than ``self._last_index_time`` are
        skipped. Documents are put on the index queue in batches of 50.
        """
        logger.info(f"Worker parsing {len(raw_docs)} documents")

        # Author lookups are remembered per user id for this call, so a user
        # who wrote many pages costs one API request instead of one per page.
        authors: Dict = {}
        parsed_docs = []
        total_fed = 0
        for raw_page in raw_docs:
            last_modified = datetime.strptime(raw_page["updated_at"], "%Y-%m-%dT%H:%M:%S.%fZ")
            if last_modified < self._last_index_time:
                continue

            page_id = raw_page["id"]
            page_content = self._book_stack.get_page(page_id)
            author_name = page_content["created_by"]["name"]

            author_id = raw_page["created_by"]
            if author_id not in authors:
                authors[author_id] = self._book_stack.get_user(author_id)
            author = authors[author_id]
            # get_user returns None on an HTTP error; fall back to no image
            author_image_url = author["avatar_url"] if author else ""

            plain_text = html_to_text(page_content["html"])

            url = urljoin(self._config.get('url'), f"/books/{raw_page['book_slug']}/page/{raw_page['slug']}")

            parsed_docs.append(BasicDocument(title=raw_page["name"],
                                             content=plain_text,
                                             author=author_name,
                                             author_image_url=author_image_url,
                                             timestamp=last_modified,
                                             id=page_id,
                                             data_source_id=self._data_source_id,
                                             location=raw_page["book"]["name"],
                                             url=url,
                                             type=DocumentType.DOCUMENT))
            if len(parsed_docs) >= 50:
                total_fed += len(parsed_docs)
                IndexQueue.get_instance().put(docs=parsed_docs)
                parsed_docs = []

        # Flush whatever is left of the last, partial batch
        IndexQueue.get_instance().put(docs=parsed_docs)
        total_fed += len(parsed_docs)
        if total_fed > 0:
            logger.info(f"Worker fed {total_fed} documents")

    def _list_book_pages(self, book: Dict) -> List[Dict]:
        """Return all pages of ``book``."""
        logger.info(f"Getting documents from book {book['name']} ({book['id']})")
        return self._book_stack.get_all_pages_from_book(book)
+
+
+# if __name__ == "__main__":
+# import os
+# config = {"url": os.environ["BOOKSTACK_URL"], "token_id": os.environ["BOOKSTACK_TOKEN_ID"],
+# "token_secret": os.environ["BOOKSTACK_TOKEN_SECRET"]}
+# book_stack = BookstackDataSource(config=config, data_source_id=0)
+# book_stack._feed_new_documents()
@@ -1,4 +1,3 @@
-import concurrent.futures
 import logging
 from datetime import datetime
 from typing import List, Dict
@@ -9,11 +8,14 @@
 from data_source_api.base_data_source import BaseDataSource, ConfigField, HTMLInputType
 from data_source_api.exception import InvalidDataSourceConfig
 from data_source_api.utils import parse_with_workers
-from indexing_queue import IndexingQueue
+from index_queue import IndexQueue
 from parsers.html import html_to_text
 from pydantic import BaseModel
 
 
+logger = logging.getLogger(__name__)
+
+
 class ConfluenceConfig(BaseModel):
  url: str
  token: str
@@ -29,12 +31,12 @@ def get_config_fields() -> List[ConfigField]:
  ]
 
  @staticmethod
- def list_spaces(confluence: Confluence) -> List[Dict]:
+ def list_spaces(confluence: Confluence, start=0) -> List[Dict]:
  # Usually the confluence connection fails, so we retry a few times
  retries = 3
  for i in range(retries):
  try:
- return confluence.get_all_spaces(expand='status')['results']
+ return confluence.get_all_spaces(expand='status', start=start)['results']
  except Exception as e:
  logging.error(f'Confluence connection failed: {e}')
  if i == retries - 1:
@@ -55,9 +57,24 @@ def __init__(self, *args, **kwargs):
  self._confluence = Confluence(url=confluence_config.url, token=confluence_config.token, verify_ssl=False)
 
  def _list_spaces(self) -> List[Dict]:
- return ConfluenceDataSource.list_spaces(confluence=self._confluence)
+ logger.info('Listing spaces')
+
+ spaces = []
+ start = 0
+ while True:
+ new_spaces = ConfluenceDataSource.list_spaces(confluence=self._confluence, start=start)
+ if len(new_spaces) == 0:
+ break
+
+ spaces.extend(new_spaces)
+ start += len(new_spaces)
+
+ logger.info(f'Found {len(spaces)} spaces')
+ return spaces
 
  def _feed_new_documents(self) -> None:
+ logger.info('Feeding new documents with Confluence')
+
  spaces = self._list_spaces()
  raw_docs = []
  for space in spaces:
@@ -98,10 +115,10 @@ def _parse_documents_worker(self, raw_docs: List[Dict]):
  type=DocumentType.DOCUMENT))
  if len(parsed_docs) >= 50:
  total_fed += len(parsed_docs)
- IndexingQueue.get().feed(docs=parsed_docs)
+ IndexQueue.get_instance().put(docs=parsed_docs)
  parsed_docs = []
 
- IndexingQueue.get().feed(docs=parsed_docs)
+ IndexQueue.get_instance().put(docs=parsed_docs)
  total_fed += len(parsed_docs)
  if total_fed > 0:
  logging.info(f'Worker fed {total_fed} documents')