22import io
33import logging
44from datetime import datetime
5- from typing import Dict
5+ from typing import Dict , List
66
77from apiclient .discovery import build
88from googleapiclient .http import MediaIoBaseDownload
@@ -44,34 +44,53 @@ def __init__(self, *args, **kwargs):
4444 self ._credentials = ServiceAccountCredentials .from_json_keyfile_dict (self ._config , scopes = scopes )
4545 self ._http_auth = self ._credentials .authorize (Http ())
4646 self ._drive = build ('drive' , 'v3' , http = self ._http_auth )
47-
48- def _should_index_file (self , file ):
49- mime_types = [
47+
48+ self ._supported_mime_types = [
5049 'application/vnd.google-apps.document' ,
5150 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' ,
5251 'application/vnd.openxmlformats-officedocument.presentationml.presentation'
5352 ]
54- return file ['kind' ] == 'drive#file' and file ['mimeType' ] in mime_types
5553
56- def _feed_new_documents (self ) -> None :
57- files = self ._drive .files ().list (fields = 'files(kind,id,name,mimeType,owners,webViewLink,modifiedTime,parents)' ).execute ()
58- files = files ['files' ]
54+ def _should_index_file (self , file ):
55+ if file ['mimeType' ] not in self ._supported_mime_types :
56+ return False
57+
58+ last_modified = datetime .strptime (file ['modifiedTime' ], "%Y-%m-%dT%H:%M:%S.%fZ" )
59+ if last_modified < self ._last_index_time :
60+ return False
61+
62+ return True
63+
64+ def _index_files_from_drive (self , drive ) -> List [dict ]:
65+ is_shared_drive = drive ['id' ] is not None
66+
67+ print (f'Indexing drive { drive ["name" ]} ' )
68+
69+ kwargs = {
70+ 'corpora' : 'drive' ,
71+ 'driveId' : drive ['id' ],
72+ 'includeItemsFromAllDrives' : True ,
73+ 'supportsAllDrives' : True ,
74+ } if is_shared_drive else {}
75+
76+ # Todo: add pagination
77+ files = self ._drive .files ().list (
78+ fields = 'files(kind,id,name,mimeType,lastModifyingUser,webViewLink,modifiedTime,parents)' ,
79+ pageSize = 1000 ,
80+ ** kwargs
81+ ).execute ()['files' ]
82+
5983 files = [file for file in files if self ._should_index_file (file )]
84+
6085 documents = []
6186
62- logging .getLogger ().info (f'got { len (files )} documents from google drive.' )
87+ logging .getLogger ().info (f'got { len (files )} documents from drive { drive [ "name" ] } .' )
6388
6489 for file in files :
6590 logging .getLogger ().info (f'processing file { file ["name" ]} ' )
66- last_modified = datetime .strptime (file ['modifiedTime' ], "%Y-%m-%dT%H:%M:%S.%fZ" )
67- if last_modified < self ._last_index_time :
68- continue
6991
7092 file_id = file ['id' ]
71- file = self ._drive .files ().get (fileId = file_id ,
72- fields = 'id,name,mimeType,owners,webViewLink,modifiedTime,parents' ).execute ()
7393 file_to_download = file ['name' ]
74-
7594 if file ['mimeType' ] == 'application/vnd.google-apps.document' :
7695 content = self ._drive .files ().export (fileId = file_id , mimeType = 'text/html' ).execute ().decode ('utf-8' )
7796 content = html_to_text (content )
@@ -100,25 +119,38 @@ def _feed_new_documents(self) -> None:
100119 os .remove (file_to_download )
101120 except Exception as error :
102121 logging .exception (f'Error occured parsing file "{ file ["name" ]} " from google drive' )
103-
122+
104123 try :
105- parent = self ._drive .files ().get (fileId = file ['parents' ][0 ], fields = 'name' ).execute ()
124+ if is_shared_drive :
125+ parent = self ._drive .drives ().get (driveId = drive ['id' ], fields = 'name' ).execute ()
126+ else :
127+ parent = self ._drive .files ().get (fileId = file ['parents' ][0 ], fields = 'name' ).execute ()
106128 parent_name = parent ['name' ]
107129 except Exception as e :
108130 logging .exception (f"Error while getting folder name of google docs file { file ['name' ]} " )
109131 parent_name = ''
110132
133+ last_modified = datetime .strptime (file ['modifiedTime' ], "%Y-%m-%dT%H:%M:%S.%fZ" )
134+
111135 documents .append (BasicDocument (
112136 id = file_id ,
113137 data_source_id = self ._data_source_id ,
114138 type = DocumentType .DOCUMENT ,
115139 title = file ['name' ],
116140 content = content ,
117- author = file ['owners' ][ 0 ]['displayName' ],
118- author_image_url = file ['owners' ][ 0 ]['photoLink' ],
141+ author = file ['lastModifyingUser' ]['displayName' ],
142+ author_image_url = file ['lastModifyingUser' ]['photoLink' ],
119143 location = parent_name ,
120144 url = file ['webViewLink' ],
121145 timestamp = last_modified
122146 ))
123147
124148 IndexingQueue .get ().feed (documents )
149+
150+ def _get_all_drives (self ) -> List [dict ]:
151+ return [{'name' : 'My Drive' , 'id' : None }] \
152+ + self ._drive .drives ().list (fields = 'drives(id,name)' ).execute ()['drives' ]
153+
154+ def _feed_new_documents (self ) -> None :
155+ for drive in self ._get_all_drives ():
156+ self ._index_files_from_drive (drive )
0 commit comments