Skip to content

Commit 0dd862b

Browse files
committed
Merged
2 parents 86e6503 + 7a63c54 commit 0dd862b

32 files changed

+903
-137
lines changed

.dockerignore

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
11
ui/node_modules
22
app/venv
3-
storage
3+
storage
4+
app/.env
5+
.env

.github/workflows/deploy.yaml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,15 @@ jobs:
2424
- name: Checkout code
2525
uses: actions/checkout@v2
2626

27+
- name: 🐷 TruffleHog OSS
28+
uses: trufflesecurity/trufflehog@v3.29.1
29+
with:
30+
path: ./
31+
base: ${{ github.event.repository.default_branch }}
32+
head: HEAD
33+
extra_args: --debug --only-verified
34+
35+
2736
# Use Caching for npm
2837
- name: Cache node modules
2938
uses: actions/cache@v2

README.md

Lines changed: 20 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,18 @@
1-
![first image](./images/api.gif)
2-
**Find any conversation, doc, or internal page in seconds** ⏲️⚡️
3-
**Join 100+** devs by hosting your own gerev instance, become a **hero** within your org! 💪
4-
51
## Join Discord for early access code!
62

73
![Discord Shield](https://discordapp.com/api/guilds/1060085859497549844/widget.png?style=shield)
84

9-
[Join here!](https://discord.gg/aMRRcmhAdW)
10-
# Search focused on devs
11-
Devs are the best early adopters, they adopt technology early and aid in spreading it to their non-technical peers. That's why gerev is focused on making a product dev's adore and love ❤️
5+
[Join here!](https://discord.gg/NKhTX7JZAF)
6+
7+
# **New!** 🎉 ChatGPT plugin support!
8+
**Connect Gerev as a plugin to ChatGPT -> ask questions about your entire organization.**
9+
See more awesome plugins [awesome-chatgpt-plugins](https://github.com/GerevAI/awesome-chatgpt-plugins)
10+
![ChatGPT Integration](./images/integ.jpeg)
11+
12+
# Search engine for your organization!
13+
![first image](./images/api.gif)
14+
**Find any conversation, doc, or internal page in seconds** ⏲️⚡️
15+
**Join 100+** devs by hosting your own gerev instance, become a **hero** within your org! 💪
1216

1317
## Made for devs 👨‍💻
1418
- **For finding internal pages _fast_ ⚡️**
@@ -25,17 +29,20 @@ Coming Soon...
2529
- [x] Confluence
2630
- [x] Google Drive (Docs, .docx, .pptx)
2731
- [X] Confluence Cloud - by [@bryan-pakulski](https://github.com/bryan-pakulski) :pray:
28-
- [ ] Bookstack
29-
- [ ] RocketChat (in PR)
30-
- [ ] Gitlab Issues (In PR)
31-
- [ ] Notion (In Progress...)
32+
- [X] Bookstack - by [@flifloo](https://github.com/flifloo) :pray:
33+
- [X] Mattermost - by [@itaykal](https://github.com/Itaykal) :pray:
34+
- [X] RocketChat - by [@flifloo](https://github.com/flifloo) :pray:
35+
- [ ] Gitlab Issues (In PR :pray:)
36+
- [ ] Zendesk (In PR :pray:)
37+
- [ ] Notion (In Progress... :pray:)
3238
- [ ] Microsoft Teams
3339
- [ ] Sharepoint
40+
- [ ] Jira
3441

3542
:pray: - by the community
36-
3743

38-
## Natural Langauge
44+
45+
## Natural Language
3946
Enables searching using natural language. such as `"How to do X"`, `"how to connect to Y"`, `"Do we support Z"`
4047

4148
---

app/data_source_api/utils.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,12 @@
1+
import base64
12
import importlib
23
import logging
34
import concurrent.futures
5+
from functools import lru_cache
6+
from io import BytesIO
7+
from typing import Optional
8+
9+
import requests
410

511
logger = logging.getLogger(__name__)
612

@@ -37,3 +43,17 @@ def parse_with_workers(method_name: callable, items: list, **kwargs):
3743
e = w.exception()
3844
if e:
3945
logging.exception("Worker failed", exc_info=e)
46+
47+
48+
@lru_cache(maxsize=512)
def get_confluence_user_image(image_url: str, token: str) -> Optional[str]:
    """Fetch a Confluence user's avatar and return it as a base64 data URI.

    Results are memoized per (image_url, token) pair so repeated lookups for
    the same user do not re-hit the Confluence server.

    :param image_url: absolute URL of the user's avatar image.
    :param token: bearer token used to authorize the request.
    :return: a ``data:image/jpeg;base64,...`` string, or None on any failure.
    """
    try:
        # Confluence serves a default SVG for anonymous users; ask for the
        # PNG variant instead so the result embeds as a raster image.
        if "anonymous.svg" in image_url:
            image_url = image_url.replace(".svg", ".png")

        response = requests.get(url=image_url, timeout=1, headers={'Accept': 'application/json',
                                                                   "Authorization": f"Bearer {token}"})
        # Bug fix: previously an HTTP error page would be base64-encoded and
        # returned as if it were the avatar; fail instead.
        response.raise_for_status()
        # NOTE(review): the data URI is always labeled image/jpeg although the
        # server may return PNG — consider deriving the MIME type from the
        # Content-Type response header.
        return f"data:image/jpeg;base64,{base64.b64encode(response.content).decode()}"
    except Exception:  # narrowed from bare `except:` so SystemExit/KeyboardInterrupt propagate
        logger.warning(f"Failed to get confluence user image {image_url}")
        return None

app/data_sources/bookstack.py

Lines changed: 215 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,215 @@
1+
import logging
2+
from datetime import datetime
3+
from typing import List, Dict
4+
5+
from data_source_api.basic_document import BasicDocument, DocumentType
6+
from data_source_api.base_data_source import BaseDataSource, ConfigField, HTMLInputType
7+
from data_source_api.exception import InvalidDataSourceConfig
8+
from data_source_api.utils import parse_with_workers
9+
from index_queue import IndexQueue
10+
from parsers.html import html_to_text
11+
from pydantic import BaseModel
12+
from requests import Session, HTTPError
13+
from requests.auth import AuthBase
14+
from urllib.parse import urljoin
15+
from time import sleep
16+
17+
18+
logger = logging.getLogger(__name__)
19+
20+
21+
class BookStackAuth(AuthBase):
    """requests auth hook that attaches a BookStack API token header.

    BookStack expects ``Authorization: Token <token_id>:<token_secret>``.
    """

    def __init__(self, token_id, token_secret, header_key="Authorization"):
        self.header_key = header_key
        self.token_id = token_id
        self.token_secret = token_secret

    def __call__(self, r):
        # Inject the token pair into the outgoing request's headers.
        token_value = f"Token {self.token_id}:{self.token_secret}"
        r.headers[self.header_key] = token_value
        return r
30+
31+
32+
class BookStack(Session):
    """requests.Session specialized for one BookStack instance's REST API.

    Adds token authentication, base-URL joining, transparent handling of
    HTTP 429 rate limiting, and helpers for the paginated list endpoints.
    """

    def __init__(self, url: str, token_id: str, token_secret: str, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.base_url = url
        self.auth = BookStackAuth(token_id, token_secret)
        # Session-wide flag: while True, every request in this session waits.
        self.rate_limit_reach = False

    def request(self, method, url_path, *args, **kwargs):
        """Issue an API request, waiting out rate limits.

        :param url_path: path joined onto ``base_url`` (absolute URLs pass through).
        :raises requests.HTTPError: for any non-200, non-429 response.
        """
        # If another request already tripped the rate limit, wait for it to clear.
        while self.rate_limit_reach:
            sleep(1)

        url = urljoin(self.base_url, url_path)
        # SECURITY NOTE(review): verify=False disables TLS certificate
        # validation on every call — consider making this configurable.
        r = super().request(method, url, verify=False, *args, **kwargs)

        if r.status_code != 200:
            if r.status_code == 429:
                if not self.rate_limit_reach:
                    logger.info("API rate limit reach, waiting...")
                    self.rate_limit_reach = True
                    sleep(60)
                    self.rate_limit_reach = False
                    logger.info("Done waiting for the API rate limit")
                # Retry through request() so a still-active limit waits again.
                # Bug fix: the retry previously passed verify=False explicitly,
                # which collided with the verify=False added above and raised
                # "TypeError: request() got multiple values for argument 'verify'".
                return self.request(method, url, *args, **kwargs)
            r.raise_for_status()
        return r

    def get_list(self, url: str, count: int = 500, sort: str = None, filters: Dict = None):
        """Fetch a complete paginated listing from `url`.

        :param count: page size per API call.
        :param sort: optional sort expression, e.g. "+updated_at".
        :param filters: field -> value mapping, sent as ``filter[field]=value``.
        :return: the concatenated ``data`` entries of every page.
        """
        # Add filter[...] to keys, avoiding the insertion of unwanted parameters
        if filters is not None:
            filters = {f"filter[{k}]": v for k, v in filters.items()}
        else:
            filters = {}

        data = []
        records = 0
        total = 1  # Set 1 to enter the loop
        while records < total:
            r = self.get(url, params={"count": count, "offset": records, "sort": sort, **filters},
                         headers={"Content-Type": "application/json"})
            json = r.json()
            data += json.get("data")
            records = len(data)
            total = json.get("total")
        return data

    def get_all_books(self) -> List[Dict]:
        """List every book, oldest-updated first."""
        return self.get_list("/api/books", sort="+updated_at")

    def get_all_pages_from_book(self, book) -> List[Dict]:
        """List every page of `book`, each annotated with its parent book dict."""
        pages = self.get_list("/api/pages", sort="+updated_at", filters={"book_id": book["id"]})

        # Add parent book object to each page
        for page in pages:
            page.update({"book": book})

        return pages

    def get_page(self, page_id: int):
        """Fetch a single page, including its rendered HTML content."""
        r = self.get(f"/api/pages/{page_id}", headers={"Content-Type": "application/json"})
        return r.json()

    def get_user(self, user_id: int):
        """Fetch a user record, or None when the token lacks user-read privileges."""
        try:
            return self.get(f"/api/users/{user_id}", headers={"Content-Type": "application/json"}).json()
        # If the user lack the privileges to make this call, return None
        except HTTPError:
            return None
99+
100+
101+
class BookStackConfig(BaseModel):
    """Connection settings for one BookStack data source."""

    url: str           # base URL of the BookStack instance
    token_id: str      # API token id
    token_secret: str  # API token secret
105+
106+
107+
class BookstackDataSource(BaseDataSource):
    """Data source that indexes pages from a BookStack instance."""

    @staticmethod
    def get_config_fields() -> List[ConfigField]:
        """Describe the config fields the UI should render for BookStack."""
        return [
            ConfigField(label="BookStack instance URL", name="url"),
            ConfigField(label="Token ID", name="token_id", input_type=HTMLInputType.PASSWORD),
            ConfigField(label="Token Secret", name="token_secret", input_type=HTMLInputType.PASSWORD)
        ]

    @classmethod
    def get_display_name(cls) -> str:
        return "BookStack"

    @staticmethod
    def list_books(book_stack: BookStack) -> List[Dict]:
        """List all books, retrying transient connection failures.

        :raises Exception: the last error, once all retries are exhausted.
        """
        # Usually the book_stack connection fails, so we retry a few times
        retries = 3
        for i in range(retries):
            try:
                return book_stack.get_all_books()
            except Exception as e:
                # Consistency fix: use the module logger instead of the root
                # `logging` logger, matching the rest of this module.
                logger.error(f"BookStack connection failed: {e}")
                if i == retries - 1:
                    raise e

    @staticmethod
    def validate_config(config: Dict) -> None:
        """Raise InvalidDataSourceConfig when `config` cannot reach BookStack."""
        try:
            parsed_config = BookStackConfig(**config)
            book_stack = BookStack(url=parsed_config.url, token_id=parsed_config.token_id,
                                   token_secret=parsed_config.token_secret)
            BookstackDataSource.list_books(book_stack=book_stack)
        except Exception as e:
            raise InvalidDataSourceConfig from e

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        book_stack_config = BookStackConfig(**self._config)
        self._book_stack = BookStack(url=book_stack_config.url, token_id=book_stack_config.token_id,
                                     token_secret=book_stack_config.token_secret)

    def _list_books(self) -> List[Dict]:
        logger.info("Listing books with BookStack")
        return BookstackDataSource.list_books(book_stack=self._book_stack)

    def _feed_new_documents(self) -> None:
        """Collect every page of every book and hand them to parse workers."""
        logger.info("Feeding new documents with BookStack")

        books = self._list_books()
        raw_docs = []
        for book in books:
            raw_docs.extend(self._list_book_pages(book))

        parse_with_workers(self._parse_documents_worker, raw_docs)

    def _parse_documents_worker(self, raw_docs: List[Dict]):
        """Parse raw page dicts into BasicDocuments, feeding the index queue in batches of 50."""
        logger.info(f"Worker parsing {len(raw_docs)} documents")

        parsed_docs = []
        total_fed = 0
        for raw_page in raw_docs:
            # assumes BookStack timestamps look like 2023-01-02T03:04:05.000000Z — TODO confirm
            last_modified = datetime.strptime(raw_page["updated_at"], "%Y-%m-%dT%H:%M:%S.%fZ")
            if last_modified < self._last_index_time:
                # Page unchanged since the last indexing run — skip it.
                continue

            page_id = raw_page["id"]
            page_content = self._book_stack.get_page(page_id)
            author_name = page_content["created_by"]["name"]

            author_image_url = ""
            # get_user returns None when the token lacks user-read privileges.
            author = self._book_stack.get_user(raw_page["created_by"])
            if author:
                author_image_url = author["avatar_url"]

            plain_text = html_to_text(page_content["html"])

            url = urljoin(self._config.get('url'), f"/books/{raw_page['book_slug']}/page/{raw_page['slug']}")

            parsed_docs.append(BasicDocument(title=raw_page["name"],
                                             content=plain_text,
                                             author=author_name,
                                             author_image_url=author_image_url,
                                             timestamp=last_modified,
                                             id=page_id,
                                             data_source_id=self._data_source_id,
                                             location=raw_page["book"]["name"],
                                             url=url,
                                             type=DocumentType.DOCUMENT))
            if len(parsed_docs) >= 50:
                total_fed += len(parsed_docs)
                IndexQueue.get_instance().put(docs=parsed_docs)
                parsed_docs = []

        # Flush the final partial batch. Fix: previously an empty list was
        # also pushed onto the queue when the batch happened to be empty.
        if parsed_docs:
            IndexQueue.get_instance().put(docs=parsed_docs)
            total_fed += len(parsed_docs)
        if total_fed > 0:
            # Consistency fix: module logger instead of the root logger.
            logger.info(f"Worker fed {total_fed} documents")

    def _list_book_pages(self, book: Dict) -> List[Dict]:
        logger.info(f"Getting documents from book {book['name']} ({book['id']})")
        return self._book_stack.get_all_pages_from_book(book)
208+
209+
210+
# if __name__ == "__main__":
211+
# import os
212+
# config = {"url": os.environ["BOOKSTACK_URL"], "token_id": os.environ["BOOKSTACK_TOKEN_ID"],
213+
# "token_secret": os.environ["BOOKSTACK_TOKEN_SECRET"]}
214+
# book_stack = BookstackDataSource(config=config, data_source_id=0)
215+
# book_stack._feed_new_documents()

app/data_sources/confluence.py

Lines changed: 24 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
import concurrent.futures
21
import logging
32
from datetime import datetime
43
from typing import List, Dict
@@ -9,11 +8,14 @@
98
from data_source_api.base_data_source import BaseDataSource, ConfigField, HTMLInputType
109
from data_source_api.exception import InvalidDataSourceConfig
1110
from data_source_api.utils import parse_with_workers
12-
from indexing_queue import IndexingQueue
11+
from index_queue import IndexQueue
1312
from parsers.html import html_to_text
1413
from pydantic import BaseModel
1514

1615

16+
logger = logging.getLogger(__name__)
17+
18+
1719
class ConfluenceConfig(BaseModel):
1820
url: str
1921
token: str
@@ -29,12 +31,12 @@ def get_config_fields() -> List[ConfigField]:
2931
]
3032

3133
@staticmethod
32-
def list_spaces(confluence: Confluence) -> List[Dict]:
34+
def list_spaces(confluence: Confluence, start=0) -> List[Dict]:
3335
# Usually the confluence connection fails, so we retry a few times
3436
retries = 3
3537
for i in range(retries):
3638
try:
37-
return confluence.get_all_spaces(expand='status')['results']
39+
return confluence.get_all_spaces(expand='status', start=start)['results']
3840
except Exception as e:
3941
logging.error(f'Confluence connection failed: {e}')
4042
if i == retries - 1:
@@ -55,9 +57,24 @@ def __init__(self, *args, **kwargs):
5557
self._confluence = Confluence(url=confluence_config.url, token=confluence_config.token, verify_ssl=False)
5658

5759
def _list_spaces(self) -> List[Dict]:
    """Return every space on the Confluence server, paging until exhausted."""
    logger.info('Listing spaces')

    spaces = []
    offset = 0
    batch = ConfluenceDataSource.list_spaces(confluence=self._confluence, start=offset)
    while batch:
        spaces.extend(batch)
        offset += len(batch)
        batch = ConfluenceDataSource.list_spaces(confluence=self._confluence, start=offset)

    logger.info(f'Found {len(spaces)} spaces')
    return spaces
5974

6075
def _feed_new_documents(self) -> None:
76+
logger.info('Feeding new documents with Confluence')
77+
6178
spaces = self._list_spaces()
6279
raw_docs = []
6380
for space in spaces:
@@ -98,10 +115,10 @@ def _parse_documents_worker(self, raw_docs: List[Dict]):
98115
type=DocumentType.DOCUMENT))
99116
if len(parsed_docs) >= 50:
100117
total_fed += len(parsed_docs)
101-
IndexingQueue.get().feed(docs=parsed_docs)
118+
IndexQueue.get_instance().put(docs=parsed_docs)
102119
parsed_docs = []
103120

104-
IndexingQueue.get().feed(docs=parsed_docs)
121+
IndexQueue.get_instance().put(docs=parsed_docs)
105122
total_fed += len(parsed_docs)
106123
if total_fed > 0:
107124
logging.info(f'Worker fed {total_fed} documents')

0 commit comments

Comments
 (0)