Skip to content

Commit 5b23ae7

Browse files
pieh and mrstork authored
feat: allow to scan secrets without buffering whole lines (#6318)
* test: replace snapshots with explicit assertions for secrets scanning * test: add failing OOM case to secret scanning * feat: allow to scan secrets without buffering whole lines --------- Co-authored-by: Mateusz Bocian <mrstork@users.noreply.github.com>
1 parent 59a72ae commit 5b23ae7

File tree

8 files changed

+811
-660
lines changed

8 files changed

+811
-660
lines changed

packages/build/src/plugins_core/secrets_scanning/index.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ const coreStep: CoreStepFunction = async function ({
3030
netlifyConfig,
3131
explicitSecretKeys,
3232
enhancedSecretScan,
33+
featureFlags,
3334
systemLog,
3435
deployId,
3536
api,
@@ -38,6 +39,7 @@ const coreStep: CoreStepFunction = async function ({
3839

3940
const passedSecretKeys = (explicitSecretKeys || '').split(',')
4041
const envVars = netlifyConfig.build.environment as Record<string, unknown>
42+
const useMinimalChunks = featureFlags?.secret_scanning_minimal_chunks
4143

4244
systemLog?.({ passedSecretKeys, buildDir })
4345

@@ -109,6 +111,7 @@ const coreStep: CoreStepFunction = async function ({
109111
filePaths,
110112
enhancedScanning: enhancedSecretScan && enhancedScanningEnabledInEnv,
111113
omitValuesFromEnhancedScan: getOmitValuesFromEnhancedScanForEnhancedScanFromEnv(envVars),
114+
useMinimalChunks,
112115
})
113116

114117
secretMatches = scanResults.matches.filter((match) => !match.enhancedMatch)

packages/build/src/plugins_core/secrets_scanning/utils.ts

Lines changed: 254 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ interface ScanArgs {
1919
filePaths: string[]
2020
enhancedScanning?: boolean
2121
omitValuesFromEnhancedScan?: unknown[]
22+
useMinimalChunks: boolean
2223
}
2324

2425
interface MatchResult {
@@ -146,54 +147,49 @@ const likelySecretRegex = new RegExp(
146147
)
147148

148149
/**
149-
* Checks a line of text for likely secrets based on known prefixes and patterns.
150+
* Checks a chunk of text for likely secrets based on known prefixes and patterns.
150151
* The function works by:
151-
* 1. Splitting the line into tokens using quotes, whitespace, equals signs, colons, and commas as delimiters
152+
* 1. Splitting the chunk into tokens using quotes, whitespace, equals signs, colons, and commas as delimiters
152153
* 2. For each token, checking if it matches our secret pattern:
153154
* - Must start (^) with one of our known prefixes (e.g. aws_, github_pat_, etc)
154155
* - Must be followed by at least MIN_CHARS_AFTER_PREFIX non-whitespace characters
155156
* - Must extend to the end ($) of the token
156157
*
157-
* For example, given the line: secretKey='aws_123456789012345678'
158+
* For example, given the chunk: secretKey='aws_123456789012345678'
158159
* 1. It's split into tokens: ['secretKey', 'aws_123456789012345678']
159160
* 2. Each token is checked against the regex pattern:
160161
* - 'secretKey' doesn't match (doesn't start with a known prefix)
161162
* - 'aws_123456789012345678' matches (starts with 'aws_' and has sufficient length)
162163
*
163-
* @param line The line of text to check
164-
* @param file The file path where this line was found
165-
* @param lineNumber The line number in the file
166-
* @param omitValuesFromEnhancedScan Optional array of values to exclude from matching
167-
* @returns Array of matches found in the line
168164
*/
169165
export function findLikelySecrets({
170-
line,
171-
file,
172-
lineNumber,
166+
text,
173167
omitValuesFromEnhancedScan = [],
174168
}: {
175-
line: string
176-
file: string
177-
lineNumber: number
169+
/**
170+
* Text to check
171+
*/
172+
text: string
173+
/**
174+
* Optional array of values to exclude from matching
175+
*/
178176
omitValuesFromEnhancedScan?: unknown[]
179-
}): MatchResult[] {
180-
if (!line) return []
177+
}): { index: number; prefix: string }[] {
178+
if (!text) return []
181179

182-
const matches: MatchResult[] = []
180+
const matches: ReturnType<typeof findLikelySecrets> = []
183181
let match: RegExpExecArray | null
184182
const allOmittedValues = [...omitValuesFromEnhancedScan, ...SAFE_LISTED_VALUES]
185183

186-
while ((match = likelySecretRegex.exec(line)) !== null) {
184+
while ((match = likelySecretRegex.exec(text)) !== null) {
187185
const token = match.groups?.token
188186
const prefix = match.groups?.prefix
189187
if (!token || !prefix || allOmittedValues.includes(token)) {
190188
continue
191189
}
192190
matches.push({
193-
file,
194-
lineNumber,
195-
key: prefix,
196-
enhancedMatch: true,
191+
prefix,
192+
index: match.index,
197193
})
198194
}
199195

@@ -279,6 +275,7 @@ export async function scanFilesForKeyValues({
279275
base,
280276
enhancedScanning,
281277
omitValuesFromEnhancedScan = [],
278+
useMinimalChunks = false,
282279
}: ScanArgs): Promise<ScanResults> {
283280
const scanResults: ScanResults = {
284281
matches: [],
@@ -309,6 +306,8 @@ export async function scanFilesForKeyValues({
309306

310307
let settledPromises: PromiseSettledResult<MatchResult[]>[] = []
311308

309+
const searchStream = useMinimalChunks ? searchStreamMinimalChunks : searchStreamReadline
310+
312311
// process the scanning in batches to not run into memory issues by
313312
// processing all files at the same time.
314313
while (filePaths.length > 0) {
@@ -333,19 +332,24 @@ export async function scanFilesForKeyValues({
333332
return scanResults
334333
}
335334

336-
const searchStream = ({
337-
basePath,
338-
file,
339-
keyValues,
340-
enhancedScanning,
341-
omitValuesFromEnhancedScan = [],
342-
}: {
335+
type SearchStreamOptions = {
343336
basePath: string
344337
file: string
345338
keyValues: Record<string, string[]>
346339
enhancedScanning?: boolean
347340
omitValuesFromEnhancedScan?: unknown[]
348-
}): Promise<MatchResult[]> => {
341+
}
342+
343+
/**
344+
* Search stream implementation using node:readline
345+
*/
346+
const searchStreamReadline = ({
347+
basePath,
348+
file,
349+
keyValues,
350+
enhancedScanning,
351+
omitValuesFromEnhancedScan = [],
352+
}: SearchStreamOptions): Promise<MatchResult[]> => {
349353
return new Promise((resolve, reject) => {
350354
const filePath = path.resolve(basePath, file)
351355

@@ -382,7 +386,14 @@ const searchStream = ({
382386
lineNumber++
383387
if (typeof line === 'string') {
384388
if (enhancedScanning) {
385-
matches.push(...findLikelySecrets({ line, file, lineNumber, omitValuesFromEnhancedScan }))
389+
matches.push(
390+
...findLikelySecrets({ text: line, omitValuesFromEnhancedScan }).map(({ prefix }) => ({
391+
key: prefix,
392+
file,
393+
lineNumber,
394+
enhancedMatch: true,
395+
})),
396+
)
386397
}
387398
if (maxMultiLineCount > 1) {
388399
lines.push(line)
@@ -472,6 +483,218 @@ const searchStream = ({
472483
})
473484
}
474485

486+
/**
 * Search stream implementation that uses a plain read stream and a small rolling buffer, so whole lines
 * never have to be buffered (a single huge line no longer has to fit in memory).
 *
 * How it works:
 * - every decoded chunk is appended to a rolling buffer,
 * - once the buffer is longer than the longest value we search for, it is scanned and then trimmed down to
 *   its last `maxValLength` characters, so values split across chunk boundaries are still found,
 * - absolute positions and line numbers are reconstructed from counters of already discarded characters
 *   and lines,
 * - a given key is reported at most once per position and at most once per line.
 *
 * Resolves with the matches found in the file. If the path turns out to be a directory (`EISDIR`) it
 * resolves with the matches collected so far; any other stream error rejects the promise.
 */
const searchStreamMinimalChunks = ({
  basePath,
  file,
  keyValues,
  enhancedScanning,
  omitValuesFromEnhancedScan = [],
}: SearchStreamOptions): Promise<MatchResult[]> => {
  return new Promise((resolve, reject) => {
    const matches: MatchResult[] = []

    // Empty values are dropped: they can never be a meaningful match and `indexOf('', from)` never
    // returns -1, which would make the scan loop in `processBuffer` spin forever.
    const keyVals: string[] = ([] as string[]).concat(...Object.values(keyValues)).filter((value) => value !== '')

    // Reverse lookup from value to secret key name. When the same value is listed under several keys the
    // last key wins.
    const keyByValue = new Map<string, string>()
    for (const [secretKeyName, valuePermutations] of Object.entries(keyValues)) {
      for (const valuePermutation of valuePermutations) {
        keyByValue.set(valuePermutation, secretKeyName)
      }
    }

    // determine longest value that we will search for - needed to determine minimal size of rolling buffer
    const candidateLengths: number[] = keyVals.map((value) => value.length) // explicit secrets
    if (enhancedScanning) {
      // omitted likely secrets (after finding likely secret we check if it should be omitted, so we need to
      // capture at least size of omitted values)
      for (const omittedValue of omitValuesFromEnhancedScan) {
        if (typeof omittedValue === 'string') {
          candidateLengths.push(omittedValue.length)
        }
      }
      // minimum length needed to find likely secret
      for (const prefix of LIKELY_SECRET_PREFIXES) {
        candidateLengths.push(prefix.length + MIN_CHARS_AFTER_PREFIX)
      }
    }
    // reduce instead of `Math.max(...values)` so a very large list of values can't overflow the call stack
    const maxValLength = candidateLengths.reduce((longest, length) => Math.max(longest, length), 0)

    if (maxValLength === 0) {
      // no non-empty values to scan for
      resolve(matches)
      return
    }

    const filePath = path.resolve(basePath, file)

    // Let the stream do the decoding: with an explicit encoding, multi-byte characters that straddle a chunk
    // boundary are emitted intact instead of being corrupted by per-chunk `Buffer#toString()` calls.
    const inStream = createReadStream(filePath, { encoding: 'utf8' })

    let buffer = ''

    /**
     * Lazily computed positions of line breaks in the current rolling buffer. Must be reset to `null`
     * every time `buffer` changes.
     */
    let newLinesIndexesInCurrentBuffer: number[] | null = null
    function getCurrentBufferNewLineIndexes(): number[] {
      if (newLinesIndexesInCurrentBuffer === null) {
        const indexes: number[] = []
        let newLineIndex = -1
        while ((newLineIndex = buffer.indexOf('\n', newLineIndex + 1)) !== -1) {
          indexes.push(newLineIndex)
        }
        newLinesIndexesInCurrentBuffer = indexes
      }

      return newLinesIndexesInCurrentBuffer
    }

    /**
     * Amount of characters that were fully processed. Used to determine absolute position of current rolling buffer
     * in the file.
     */
    let processedCharacters = 0
    /**
     * Amount of lines that were fully processed. Used to determine absolute line number of matches in current rolling buffer.
     */
    let processedLines = 0
    /**
     * Map keeping track of found secrets in current file. Used to prevent reporting same secret+position multiple times.
     * Needed because rolling buffer might retain same secret in multiple passes.
     */
    const foundIndexes = new Map<string, Set<number>>()
    /**
     * We report given secret at most once per line, so we keep track of lines we already reported for given secret.
     */
    const foundLines = new Map<string, Set<number>>()

    /**
     * Calculate absolute line number in a file for given match in the current rolling buffer.
     * Returns `undefined` when the match was already seen at this position, or the key was already reported
     * for this line, i.e. when nothing should be reported.
     */
    function getLineNumberForMatchInTheBuffer({
      indexInBuffer,
      key,
    }: {
      indexInBuffer: number
      key: string
    }): number | undefined {
      const absolutePositionInFile = processedCharacters + indexInBuffer

      // check if we already handled match for given key in this position
      let foundIndexesForKey = foundIndexes.get(key)
      if (foundIndexesForKey?.has(absolutePositionInFile)) {
        return undefined
      }

      // ensure we track match for this key and position to not report it again in future passes
      if (!foundIndexesForKey) {
        foundIndexesForKey = new Set<number>()
        foundIndexes.set(key, foundIndexesForKey)
      }
      foundIndexesForKey.add(absolutePositionInFile)

      // calculate line number based on amount of fully processed lines and position of line breaks in current buffer
      let lineNumber = processedLines + 1
      for (const newLineIndex of getCurrentBufferNewLineIndexes()) {
        if (indexInBuffer > newLineIndex) {
          lineNumber++
        } else {
          break
        }
      }

      // check if we already handled match for given key in this line
      let foundLinesForKey = foundLines.get(key)
      if (foundLinesForKey?.has(lineNumber)) {
        return undefined
      }
      if (!foundLinesForKey) {
        foundLinesForKey = new Set<number>()
        foundLines.set(key, foundLinesForKey)
      }
      foundLinesForKey.add(lineNumber)

      // only report line number if we didn't report it yet for this key
      return lineNumber
    }

    /**
     * Scan the current rolling buffer for explicit secret values and (optionally) likely secrets and record
     * every match that was not reported before.
     */
    function processBuffer(): void {
      for (const valVariant of keyVals) {
        // the key only depends on the value, so look it up once per value instead of once per occurrence
        const key = keyByValue.get(valVariant) ?? ''
        let indexInBuffer = -1
        while ((indexInBuffer = buffer.indexOf(valVariant, indexInBuffer + 1)) !== -1) {
          const lineNumber = getLineNumberForMatchInTheBuffer({ indexInBuffer, key })

          if (typeof lineNumber === 'number') {
            matches.push({
              file,
              lineNumber,
              key,
              enhancedMatch: false,
            })
          }
        }
      }

      if (enhancedScanning) {
        const likelySecrets = findLikelySecrets({ text: buffer, omitValuesFromEnhancedScan })
        for (const { index, prefix } of likelySecrets) {
          const lineNumber = getLineNumberForMatchInTheBuffer({
            indexInBuffer: index,
            key: prefix,
          })

          if (typeof lineNumber === 'number') {
            matches.push({
              file,
              lineNumber,
              key: prefix,
              enhancedMatch: true,
            })
          }
        }
      }
    }

    inStream.on('data', function (chunk: string | Buffer) {
      // the stream is created with an encoding so chunks arrive as strings; the fallback only keeps types honest
      buffer += typeof chunk === 'string' ? chunk : chunk.toString()

      // reset new line positions in current buffer
      newLinesIndexesInCurrentBuffer = null

      if (buffer.length > maxValLength) {
        // only process if buffer is large enough to contain longest secret, if final chunk isn't large enough
        // it will be processed in `close` event handler
        processBuffer()

        // we will keep maxValLength characters in the buffer, surplus of characters at this point is fully processed
        const charactersInBufferThatWereFullyProcessed = buffer.length - maxValLength
        processedCharacters += charactersInBufferThatWereFullyProcessed

        // advance processed lines
        for (const newLineIndex of getCurrentBufferNewLineIndexes()) {
          if (newLineIndex < charactersInBufferThatWereFullyProcessed) {
            processedLines++
          } else {
            break
          }
        }

        // Keep the last part of the buffer to handle split values across chunks
        buffer = buffer.slice(charactersInBufferThatWereFullyProcessed)

        // the cached line break positions describe the untrimmed buffer - drop them so the final pass in
        // the `close` handler can't compute line numbers from stale indexes
        newLinesIndexesInCurrentBuffer = null
      }
    })

    inStream.on('error', function (error: NodeJS.ErrnoException) {
      if (error?.code === 'EISDIR') {
        // file path is a directory - do nothing
        resolve(matches)
      } else {
        reject(error)
      }
    })

    inStream.on('close', function () {
      // process any remaining buffer content
      processBuffer()
      resolve(matches)
    })
  })
}
697+
475698
/**
476699
* ScanResults are all of the finds for all keys and their disparate locations. Scanning is
477700
* async in streams so order can change a lot. Some matches are the result of an env var explicitly being marked as secret,

0 commit comments

Comments
 (0)