Skip to content

Commit 4df07f7

Browse files
committed
fix google drive
1 parent ee02847 commit 4df07f7

File tree

1 file changed

+51
-19
lines changed

1 file changed

+51
-19
lines changed

app/data_sources/google_drive.py

Lines changed: 51 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import io
33
import logging
44
from datetime import datetime
5-
from typing import Dict
5+
from typing import Dict, List
66

77
from apiclient.discovery import build
88
from googleapiclient.http import MediaIoBaseDownload
@@ -44,34 +44,53 @@ def __init__(self, *args, **kwargs):
4444
self._credentials = ServiceAccountCredentials.from_json_keyfile_dict(self._config, scopes=scopes)
4545
self._http_auth = self._credentials.authorize(Http())
4646
self._drive = build('drive', 'v3', http=self._http_auth)
47-
48-
def _should_index_file(self, file):
49-
mime_types = [
47+
48+
self._supported_mime_types = [
5049
'application/vnd.google-apps.document',
5150
'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
5251
'application/vnd.openxmlformats-officedocument.presentationml.presentation'
5352
]
54-
return file['kind'] == 'drive#file' and file['mimeType'] in mime_types
5553

56-
def _feed_new_documents(self) -> None:
57-
files = self._drive.files().list(fields='files(kind,id,name,mimeType,owners,webViewLink,modifiedTime,parents)').execute()
58-
files = files['files']
54+
def _should_index_file(self, file):
    """Decide whether a Drive file entry needs to be (re)indexed.

    A file qualifies when its ``mimeType`` is one of
    ``self._supported_mime_types`` and its ``modifiedTime`` is not older
    than ``self._last_index_time``.

    :param file: file entry as returned by the Drive ``files().list`` call;
        ``mimeType`` is always read, ``modifiedTime`` only for supported types.
    :return: ``True`` if the file should be indexed, ``False`` otherwise.
    """
    if file['mimeType'] in self._supported_mime_types:
        # The timestamp is parsed only for supported types, so unsupported
        # entries are rejected without touching 'modifiedTime'.
        modified_at = datetime.strptime(file['modifiedTime'], "%Y-%m-%dT%H:%M:%S.%fZ")
        return not modified_at < self._last_index_time

    return False
63+
64+
def _index_files_from_drive(self, drive) -> List[dict]:
65+
is_shared_drive = drive['id'] is not None
66+
67+
print(f'Indexing drive {drive["name"]}')
68+
69+
kwargs = {
70+
'corpora': 'drive',
71+
'driveId': drive['id'],
72+
'includeItemsFromAllDrives': True,
73+
'supportsAllDrives': True,
74+
} if is_shared_drive else {}
75+
76+
# Todo: add pagination
77+
files = self._drive.files().list(
78+
fields='files(kind,id,name,mimeType,lastModifyingUser,webViewLink,modifiedTime,parents)',
79+
pageSize=1000,
80+
**kwargs
81+
).execute()['files']
82+
5983
files = [file for file in files if self._should_index_file(file)]
84+
6085
documents = []
6186

62-
logging.getLogger().info(f'got {len(files)} documents from google drive.')
87+
logging.getLogger().info(f'got {len(files)} documents from drive {drive["name"]}.')
6388

6489
for file in files:
6590
logging.getLogger().info(f'processing file {file["name"]}')
66-
last_modified = datetime.strptime(file['modifiedTime'], "%Y-%m-%dT%H:%M:%S.%fZ")
67-
if last_modified < self._last_index_time:
68-
continue
6991

7092
file_id = file['id']
71-
file = self._drive.files().get(fileId=file_id,
72-
fields='id,name,mimeType,owners,webViewLink,modifiedTime,parents').execute()
7393
file_to_download = file['name']
74-
7594
if file['mimeType'] == 'application/vnd.google-apps.document':
7695
content = self._drive.files().export(fileId=file_id, mimeType='text/html').execute().decode('utf-8')
7796
content = html_to_text(content)
@@ -100,25 +119,38 @@ def _feed_new_documents(self) -> None:
100119
os.remove(file_to_download)
101120
except Exception as error:
102121
logging.exception(f'Error occured parsing file "{file["name"]}" from google drive')
103-
122+
104123
try:
105-
parent = self._drive.files().get(fileId=file['parents'][0], fields='name').execute()
124+
if is_shared_drive:
125+
parent = self._drive.drives().get(driveId=drive['id'], fields='name').execute()
126+
else:
127+
parent = self._drive.files().get(fileId=file['parents'][0], fields='name').execute()
106128
parent_name = parent['name']
107129
except Exception as e:
108130
logging.exception(f"Error while getting folder name of google docs file {file['name']}")
109131
parent_name = ''
110132

133+
last_modified = datetime.strptime(file['modifiedTime'], "%Y-%m-%dT%H:%M:%S.%fZ")
134+
111135
documents.append(BasicDocument(
112136
id=file_id,
113137
data_source_id=self._data_source_id,
114138
type=DocumentType.DOCUMENT,
115139
title=file['name'],
116140
content=content,
117-
author=file['owners'][0]['displayName'],
118-
author_image_url=file['owners'][0]['photoLink'],
141+
author=file['lastModifyingUser']['displayName'],
142+
author_image_url=file['lastModifyingUser']['photoLink'],
119143
location=parent_name,
120144
url=file['webViewLink'],
121145
timestamp=last_modified
122146
))
123147

124148
IndexingQueue.get().feed(documents)
149+
150+
def _get_all_drives(self) -> List[dict]:
    """Return every drive that should be indexed.

    The first entry is a placeholder for the personal drive
    (``{'name': 'My Drive', 'id': None}``); ``_index_files_from_drive``
    treats an ``id`` of ``None`` as "not a shared drive". It is followed by
    all shared drives reported by the Drive API.

    ``drives().list`` is paginated (10 results per page by default), so a
    single call silently drops shared drives beyond the first page. This
    follows ``nextPageToken`` until the listing is exhausted.

    :return: list of dicts, each with ``id`` and ``name`` keys.
    """
    drives = [{'name': 'My Drive', 'id': None}]

    params = {
        'fields': 'nextPageToken,drives(id,name)',
        'pageSize': 100,  # maximum page size accepted by drives.list
    }
    while True:
        response = self._drive.drives().list(**params).execute()
        # Tolerate a response without a 'drives' key instead of raising KeyError.
        drives.extend(response.get('drives', []))

        next_token = response.get('nextPageToken')
        if not next_token:
            break
        params['pageToken'] = next_token

    return drives
153+
154+
def _feed_new_documents(self) -> None:
    """Index each drive returned by ``_get_all_drives``, one after another.

    Every drive is handed to ``_index_files_from_drive`` in listing order.
    An exception raised for one drive propagates and stops the run.
    """
    all_drives = self._get_all_drives()
    for current_drive in all_drives:
        self._index_files_from_drive(current_drive)

0 commit comments

Comments
 (0)