@@ -19,6 +19,7 @@ interface ScanArgs {
1919 filePaths : string [ ]
2020 enhancedScanning ?: boolean
2121 omitValuesFromEnhancedScan ?: unknown [ ]
22+ useMinimalChunks : boolean
2223}
2324
2425interface MatchResult {
@@ -146,54 +147,49 @@ const likelySecretRegex = new RegExp(
146147)
147148
148149/**
149- * Checks a line of text for likely secrets based on known prefixes and patterns.
150+ * Checks a chunk of text for likely secrets based on known prefixes and patterns.
150151 * The function works by:
151- * 1. Splitting the line into tokens using quotes, whitespace, equals signs, colons, and commas as delimiters
152+ * 1. Splitting the chunk into tokens using quotes, whitespace, equals signs, colons, and commas as delimiters
152153 * 2. For each token, checking if it matches our secret pattern:
153154 * - Must start (^) with one of our known prefixes (e.g. aws_, github_pat_, etc)
154155 * - Must be followed by at least MIN_CHARS_AFTER_PREFIX non-whitespace characters
155156 * - Must extend to the end ($) of the token
156157 *
157- * For example, given the line : secretKey='aws_123456789012345678'
158+ * For example, given the chunk : secretKey='aws_123456789012345678'
158159 * 1. It's split into tokens: ['secretKey', 'aws_123456789012345678']
159160 * 2. Each token is checked against the regex pattern:
160161 * - 'secretKey' doesn't match (doesn't start with a known prefix)
161162 * - 'aws_123456789012345678' matches (starts with 'aws_' and has sufficient length)
162163 *
163- * @param line The line of text to check
164- * @param file The file path where this line was found
165- * @param lineNumber The line number in the file
166- * @param omitValuesFromEnhancedScan Optional array of values to exclude from matching
167- * @returns Array of matches found in the line
168164 */
169165export function findLikelySecrets ( {
170- line,
171- file,
172- lineNumber,
166+ text,
173167 omitValuesFromEnhancedScan = [ ] ,
174168} : {
175- line : string
176- file : string
177- lineNumber : number
169+ /**
170+ * Text to check
171+ */
172+ text : string
173+ /**
174+ * Optional array of values to exclude from matching
175+ */
178176 omitValuesFromEnhancedScan ?: unknown [ ]
179- } ) : MatchResult [ ] {
180- if ( ! line ) return [ ]
177+ } ) : { index : number ; prefix : string } [ ] {
178+ if ( ! text ) return [ ]
181179
182- const matches : MatchResult [ ] = [ ]
180+ const matches : ReturnType < typeof findLikelySecrets > = [ ]
183181 let match : RegExpExecArray | null
184182 const allOmittedValues = [ ...omitValuesFromEnhancedScan , ...SAFE_LISTED_VALUES ]
185183
186- while ( ( match = likelySecretRegex . exec ( line ) ) !== null ) {
184+ while ( ( match = likelySecretRegex . exec ( text ) ) !== null ) {
187185 const token = match . groups ?. token
188186 const prefix = match . groups ?. prefix
189187 if ( ! token || ! prefix || allOmittedValues . includes ( token ) ) {
190188 continue
191189 }
192190 matches . push ( {
193- file,
194- lineNumber,
195- key : prefix ,
196- enhancedMatch : true ,
191+ prefix,
192+ index : match . index ,
197193 } )
198194 }
199195
@@ -279,6 +275,7 @@ export async function scanFilesForKeyValues({
279275 base,
280276 enhancedScanning,
281277 omitValuesFromEnhancedScan = [ ] ,
278+ useMinimalChunks = false ,
282279} : ScanArgs ) : Promise < ScanResults > {
283280 const scanResults : ScanResults = {
284281 matches : [ ] ,
@@ -309,6 +306,8 @@ export async function scanFilesForKeyValues({
309306
310307 let settledPromises : PromiseSettledResult < MatchResult [ ] > [ ] = [ ]
311308
309+ const searchStream = useMinimalChunks ? searchStreamMinimalChunks : searchStreamReadline
310+
312311 // process the scanning in batches to not run into memory issues by
313312 // processing all files at the same time.
314313 while ( filePaths . length > 0 ) {
@@ -333,19 +332,24 @@ export async function scanFilesForKeyValues({
333332 return scanResults
334333}
335334
336- const searchStream = ( {
337- basePath,
338- file,
339- keyValues,
340- enhancedScanning,
341- omitValuesFromEnhancedScan = [ ] ,
342- } : {
335+ type SearchStreamOptions = {
343336 basePath : string
344337 file : string
345338 keyValues : Record < string , string [ ] >
346339 enhancedScanning ?: boolean
347340 omitValuesFromEnhancedScan ?: unknown [ ]
348- } ) : Promise < MatchResult [ ] > => {
341+ }
342+
343+ /**
344+ * Search stream implementation using node:readline
345+ */
346+ const searchStreamReadline = ( {
347+ basePath,
348+ file,
349+ keyValues,
350+ enhancedScanning,
351+ omitValuesFromEnhancedScan = [ ] ,
352+ } : SearchStreamOptions ) : Promise < MatchResult [ ] > => {
349353 return new Promise ( ( resolve , reject ) => {
350354 const filePath = path . resolve ( basePath , file )
351355
@@ -382,7 +386,14 @@ const searchStream = ({
382386 lineNumber ++
383387 if ( typeof line === 'string' ) {
384388 if ( enhancedScanning ) {
385- matches . push ( ...findLikelySecrets ( { line, file, lineNumber, omitValuesFromEnhancedScan } ) )
389+ matches . push (
390+ ...findLikelySecrets ( { text : line , omitValuesFromEnhancedScan } ) . map ( ( { prefix } ) => ( {
391+ key : prefix ,
392+ file,
393+ lineNumber,
394+ enhancedMatch : true ,
395+ } ) ) ,
396+ )
386397 }
387398 if ( maxMultiLineCount > 1 ) {
388399 lines . push ( line )
@@ -472,6 +483,218 @@ const searchStream = ({
472483 } )
473484}
474485
/**
 * Search stream implementation that uses a plain read stream and a small rolling buffer,
 * so far less content has to be buffered than with a line-based reader.
 *
 * The file is read chunk by chunk. After each chunk the accumulated buffer is scanned and only its last
 * `maxValLength` characters are kept, so a value that is split across two chunks is still found on the
 * next pass. Matches are de-duplicated by absolute position and reported at most once per key and line.
 *
 * @param options.basePath Directory that `file` is resolved against
 * @param options.file File path, relative to `basePath`, that is scanned and reported in the matches
 * @param options.keyValues Map of secret key name to the value permutations to search for
 * @param options.enhancedScanning When truthy, the buffer is additionally checked with `findLikelySecrets`
 * @param options.omitValuesFromEnhancedScan Values passed to `findLikelySecrets` to exclude from likely-secret matches
 * @returns Promise resolving to the matches found in the file. It resolves with the matches collected so far when
 * the path is a directory (`EISDIR`) and rejects on any other stream error.
 */
const searchStreamMinimalChunks = ({
  basePath,
  file,
  keyValues,
  enhancedScanning,
  omitValuesFromEnhancedScan = [],
}: SearchStreamOptions): Promise<MatchResult[]> => {
  return new Promise((resolve, reject) => {
    const matches: MatchResult[] = []

    // Map each value variant to the key it is reported under. When the same value is listed under several
    // keys the last key wins. Empty values are skipped: `indexOf('')` never returns -1, so searching for
    // them would loop forever.
    const keyByValue = new Map<string, string>()
    for (const [secretKeyName, valuePermutations] of Object.entries(keyValues)) {
      for (const valVariant of valuePermutations) {
        if (valVariant === '') {
          continue
        }
        keyByValue.set(valVariant, secretKeyName)
      }
    }

    // determine longest value that we will search for - needed to determine minimal size of rolling buffer
    let maxValLength = 0
    // explicit secrets
    for (const valVariant of keyByValue.keys()) {
      maxValLength = Math.max(maxValLength, valVariant.length)
    }
    if (enhancedScanning) {
      // omitted likely secrets (after finding a likely secret we check if it should be omitted, so the buffer
      // needs to be able to hold at least the longest omitted value)
      for (const omitted of omitValuesFromEnhancedScan) {
        if (typeof omitted === 'string') {
          maxValLength = Math.max(maxValLength, omitted.length)
        }
      }
      // minimum length needed to find a likely secret
      for (const prefix of LIKELY_SECRET_PREFIXES) {
        maxValLength = Math.max(maxValLength, prefix.length + MIN_CHARS_AFTER_PREFIX)
      }
    }

    if (maxValLength === 0) {
      // no non-empty values to scan for
      resolve(matches)
      return
    }

    const filePath = path.resolve(basePath, file)

    // Let the stream decode UTF-8 itself so that multi-byte characters split across two chunks are
    // reassembled instead of being turned into replacement characters by a per-chunk `toString()`.
    const inStream = createReadStream(filePath, { encoding: 'utf8' })

    let buffer = ''

    /**
     * Lazily computed positions of line breaks in the current buffer. Must be reset to `null` whenever
     * `buffer` changes.
     */
    let newLinesIndexesInCurrentBuffer: number[] | null = null
    function getCurrentBufferNewLineIndexes(): number[] {
      if (newLinesIndexesInCurrentBuffer === null) {
        const indexes: number[] = []
        let newLineIndex = -1
        while ((newLineIndex = buffer.indexOf('\n', newLineIndex + 1)) !== -1) {
          indexes.push(newLineIndex)
        }
        newLinesIndexesInCurrentBuffer = indexes
      }

      return newLinesIndexesInCurrentBuffer
    }

    /**
     * Amount of characters that were fully processed. Used to determine absolute position of current rolling buffer
     * in the file.
     */
    let processedCharacters = 0
    /**
     * Amount of lines that were fully processed. Used to determine absolute line number of matches in current rolling buffer.
     */
    let processedLines = 0
    /**
     * Map keeping track of found secrets in current file. Used to prevent reporting same secret+position multiple times.
     * Needed because rolling buffer might retain same secret in multiple passes.
     */
    const foundIndexes = new Map<string, Set<number>>()
    /**
     * We report a given secret at most once per line, so we keep track of the lines already reported for each secret.
     */
    const foundLines = new Map<string, Set<number>>()

    /**
     * Calculate absolute line number in a file for given match in the current rolling buffer.
     * Returns `undefined` when the match was already handled (same key and position) or when the key
     * was already reported for that line.
     */
    function getLineNumberForMatchInTheBuffer({
      indexInBuffer,
      key,
    }: {
      indexInBuffer: number
      key: string
    }): number | undefined {
      const absolutePositionInFile = processedCharacters + indexInBuffer

      // check if we already handled match for given key in this position
      let foundIndexesForKey = foundIndexes.get(key)
      if (foundIndexesForKey?.has(absolutePositionInFile)) {
        return undefined
      }
      // ensure we track match for this key and position to not report it again in future passes
      if (!foundIndexesForKey) {
        foundIndexesForKey = new Set<number>()
        foundIndexes.set(key, foundIndexesForKey)
      }
      foundIndexesForKey.add(absolutePositionInFile)

      // calculate line number based on amount of fully processed lines and position of line breaks in current buffer
      let lineNumber = processedLines + 1
      for (const newLineIndex of getCurrentBufferNewLineIndexes()) {
        if (indexInBuffer > newLineIndex) {
          lineNumber++
        } else {
          break
        }
      }

      // check if we already handled match for given key in this line
      let foundLinesForKey = foundLines.get(key)
      if (foundLinesForKey?.has(lineNumber)) {
        return undefined
      }
      if (!foundLinesForKey) {
        foundLinesForKey = new Set<number>()
        foundLines.set(key, foundLinesForKey)
      }
      foundLinesForKey.add(lineNumber)

      // only report line number if we didn't report it yet for this key
      return lineNumber
    }

    /**
     * Scan the current buffer for explicit secret values and, when enabled, likely secrets,
     * and record every match that was not reported before.
     */
    function processBuffer(): void {
      for (const [valVariant, key] of keyByValue) {
        let indexInBuffer = -1
        while ((indexInBuffer = buffer.indexOf(valVariant, indexInBuffer + 1)) !== -1) {
          const lineNumber = getLineNumberForMatchInTheBuffer({ indexInBuffer, key })

          if (typeof lineNumber === 'number') {
            matches.push({
              file,
              lineNumber,
              key,
              enhancedMatch: false,
            })
          }
        }
      }

      if (enhancedScanning) {
        const likelySecrets = findLikelySecrets({ text: buffer, omitValuesFromEnhancedScan })
        for (const { index, prefix } of likelySecrets) {
          const lineNumber = getLineNumberForMatchInTheBuffer({
            indexInBuffer: index,
            key: prefix,
          })

          if (typeof lineNumber === 'number') {
            matches.push({
              file,
              lineNumber,
              key: prefix,
              enhancedMatch: true,
            })
          }
        }
      }
    }

    inStream.on('data', (chunk: string | Buffer) => {
      // with `encoding` set the stream emits strings; the fallback only exists to satisfy the listener type
      buffer += typeof chunk === 'string' ? chunk : chunk.toString()

      // reset new line positions in current buffer
      newLinesIndexesInCurrentBuffer = null

      if (buffer.length > maxValLength) {
        // only process if buffer is large enough to contain longest secret, if final chunk isn't large enough
        // it will be processed in `close` event handler
        processBuffer()

        // we will keep maxValLength characters in the buffer, surplus of characters at this point is fully processed
        const charactersInBufferThatWereFullyProcessed = buffer.length - maxValLength
        processedCharacters += charactersInBufferThatWereFullyProcessed

        // advance processed lines
        for (const newLineIndex of getCurrentBufferNewLineIndexes()) {
          if (newLineIndex < charactersInBufferThatWereFullyProcessed) {
            processedLines++
          } else {
            break
          }
        }

        // Keep the last part of the buffer to handle split values across chunks
        buffer = buffer.slice(charactersInBufferThatWereFullyProcessed)

        // cached line break positions referred to the buffer before slicing; drop them so the final pass in
        // the `close` handler does not compute line numbers from stale indexes
        newLinesIndexesInCurrentBuffer = null
      }
    })

    inStream.on('error', (error: NodeJS.ErrnoException) => {
      if (error?.code === 'EISDIR') {
        // file path is a directory - do nothing
        resolve(matches)
      } else {
        reject(error)
      }
    })

    inStream.on('close', () => {
      // process any remaining buffer content
      processBuffer()
      resolve(matches)
    })
  })
}
697+
475698/**
476699 * ScanResults are all of the finds for all keys and their disparate locations. Scanning is
477700 * async in streams so order can change a lot. Some matches are the result of an env var explicitly being marked as secret,
0 commit comments