Skip to content

Commit afaef65

Browse files
authored
Simplify the import regex (microsoft#6535)
1 parent 47db2f7 commit afaef65

File tree

6 files changed

+48
-60
lines changed

6 files changed

+48
-60
lines changed

news/2 Fixes/6319.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Simplify the import regex to minimize performance overhead.

src/client/telemetry/constants.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ export enum EventName {
7272
LINTER_NOT_INSTALLED_PROMPT = 'LINTER_NOT_INSTALLED_PROMPT',
7373
CONFIGURE_AVAILABLE_LINTER_PROMPT = 'CONFIGURE_AVAILABLE_LINTER_PROMPT',
7474
HASHED_PACKAGE_NAME = 'HASHED_PACKAGE_NAME',
75+
HASHED_PACKAGE_PERF = 'HASHED_PACKAGE_PERF',
7576

7677
JEDI_MEMORY = 'JEDI_MEMORY'
7778
}

src/client/telemetry/importTracker.ts

Lines changed: 10 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
import { inject, injectable } from 'inversify';
66
import * as path from 'path';
77
import { TextDocument } from 'vscode';
8-
import { sendTelemetryEvent } from '.';
8+
import { captureTelemetry, sendTelemetryEvent } from '.';
99
import { IDocumentManager } from '../common/application/types';
1010
import { isTestExecution } from '../common/constants';
1111
import '../common/extensions';
@@ -17,7 +17,8 @@ import { IImportTracker } from './types';
1717
Python has a fairly rich import statement. Originally the matching regexp was kept simple for
1818
performance worries, but it led to false-positives due to matching things like docstrings with
1919
phrases along the lines of "from the thing" or "import the thing". To minimize false-positives the
20-
regexp does its best to validate the structure of the import line. This leads to us supporting:
20+
regexp does its best to validate the structure of the import line _within reason_. This leads to
21+
us supporting the following (where `pkg` represents what we are actually capturing for telemetry):
2122
2223
- `from pkg import _`
2324
- `from pkg import _, _`
@@ -26,18 +27,14 @@ regexp does its best to validate the structure of the import line. This leads to
2627
- `import pkg, pkg`
2728
- `import pkg as _`
2829
29-
We can rely on the fact that the use of the `from` and `import` keywords from the start of a line are
30-
only usable for imports in valid code (`from` can also be used when raising an exception, but `raise`
31-
would be the first keyword on a line in that instance). We also get to rely on the fact that we only
32-
care about the top-level package, keeping the regex extremely greedy. This should lead to the regex
33-
failing fast and having low performance overhead.
30+
Things we are ignoring the following for simplicity/performance:
31+
32+
- `from pkg import (...)` (this includes single-line and multi-line imports with parentheses)
33+
- `import pkg # ... and anything else with a trailing comment.`
34+
- Non-standard whitespace separators within the import statement (i.e. more than a single space, tabs)
3435
35-
We can also ignore multi-line/parenthesized imports for simplicity since we don't' need 100% accuracy,
36-
just enough to be able to tell what packages user's rely on to make sure we are covering our bases
37-
in terms of support. This allows us to anchor the start and end of the regexp and not try to handle the
38-
parentheses case which adds a lot more optional parts to the regexp.
3936
*/
40-
const ImportRegEx = /^\s*(from\s+(?<fromImport>\w+)(?:\.\w+)*\s+import\s+\w+(?:\s+as\s+\w+|(?:\s*,\s*\w+)+(?:\s*,)?)?|import\s+(?<importImport>(?:\w+(?:\s*,\s*)?)+)(?:\s+as\s+\w+)?)\s*(#.*)?$/;
37+
const ImportRegEx = /^\s*(from (?<fromImport>\w+)(?:\.\w+)* import \w+(?:, \w+)*(?: as \w+)?|import (?<importImport>\w+(?:, \w+)*)(?: as \w+)?)$/;
4138
const MAX_DOCUMENT_LINES = 1000;
4239

4340
// Capture isTestExecution on module load so that a test can turn it off and still
@@ -100,6 +97,7 @@ export class ImportTracker implements IImportTracker {
10097
}
10198
}
10299

100+
@captureTelemetry(EventName.HASHED_PACKAGE_PERF)
103101
private checkDocument(document: TextDocument) {
104102
this.pendingDocs.delete(document.fileName);
105103
const lines = this.getDocumentLines(document);

src/client/telemetry/index.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -287,6 +287,7 @@ export interface IEventNamePropertyMapping {
287287
[EventName.GO_TO_OBJECT_DEFINITION]: never | undefined;
288288
[EventName.HOVER_DEFINITION]: never | undefined;
289289
[EventName.HASHED_PACKAGE_NAME]: { hashedName: string };
290+
[EventName.HASHED_PACKAGE_PERF]: never | undefined;
290291
[EventName.LINTER_NOT_INSTALLED_PROMPT]: LinterInstallPromptTelemetry;
291292
[EventName.PYTHON_INSTALL_PACKAGE]: { installer: string };
292293
[EventName.LINTING]: LintingTelemetry;

src/test/application/diagnostics/checks/updateTestSettings.unit.test.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ suite('Application Diagnostics - Check Test Settings', () => {
2525
let appEnv: IApplicationEnvironment;
2626
let storage: IPersistentState<string[]>;
2727
let workspace: IWorkspaceService;
28-
const sandbox = sinon.sandbox.create();
28+
const sandbox = sinon.createSandbox();
2929
setup(() => {
3030
fs = mock(FileSystem);
3131
appEnv = mock(ApplicationEnvironment);

src/test/telemetry/importTracker.unit.test.ts

Lines changed: 34 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -21,20 +21,31 @@ suite('Import Tracker', () => {
2121
let documentManager: TypeMoq.IMock<IDocumentManager>;
2222
let openedEventEmitter: EventEmitter<TextDocument>;
2323
let savedEventEmitter: EventEmitter<TextDocument>;
24-
const pandasHash = hashJs.sha256().update('pandas').digest('hex');
25-
const elephasHash = hashJs.sha256().update('elephas').digest('hex');
26-
const kerasHash = hashJs.sha256().update('keras').digest('hex');
27-
const pysparkHash = hashJs.sha256().update('pyspark').digest('hex');
28-
const sparkdlHash = hashJs.sha256().update('sparkdl').digest('hex');
29-
const numpyHash = hashJs.sha256().update('numpy').digest('hex');
30-
const scipyHash = hashJs.sha256().update('scipy').digest('hex');
31-
const sklearnHash = hashJs.sha256().update('sklearn').digest('hex');
32-
const randomHash = hashJs.sha256().update('random').digest('hex');
24+
const pandasHash: string = hashJs.sha256().update('pandas').digest('hex');
25+
const elephasHash: string = hashJs.sha256().update('elephas').digest('hex');
26+
const kerasHash: string = hashJs.sha256().update('keras').digest('hex');
27+
const pysparkHash: string = hashJs.sha256().update('pyspark').digest('hex');
28+
const sparkdlHash: string = hashJs.sha256().update('sparkdl').digest('hex');
29+
const numpyHash: string = hashJs.sha256().update('numpy').digest('hex');
30+
const scipyHash: string = hashJs.sha256().update('scipy').digest('hex');
31+
const sklearnHash: string = hashJs.sha256().update('sklearn').digest('hex');
32+
const randomHash: string = hashJs.sha256().update('random').digest('hex');
3333

3434
class Reporter {
3535
public static eventNames: string[] = [];
3636
public static properties: Record<string, string>[] = [];
3737
public static measures: {}[] = [];
38+
39+
public static expectHashes(...hashes: string[]) {
40+
expect(Reporter.eventNames).to.contain(EventName.HASHED_PACKAGE_PERF);
41+
if (hashes.length > 0) {
42+
expect(Reporter.eventNames).to.contain(EventName.HASHED_PACKAGE_NAME);
43+
}
44+
45+
Reporter.properties.pop(); // HASHED_PACKAGE_PERF
46+
expect(Reporter.properties).to.deep.equal(hashes.map(hash => ({ hashedName: hash })));
47+
}
48+
3849
public sendTelemetryEvent(eventName: string, properties?: {}, measures?: {}) {
3950
Reporter.eventNames.push(eventName);
4051
Reporter.properties.push(properties!);
@@ -76,24 +87,21 @@ suite('Import Tracker', () => {
7687
test('Open document', () => {
7788
emitDocEvent('import pandas\r\n', openedEventEmitter);
7889

79-
expect(Reporter.eventNames).to.deep.equal([EventName.HASHED_PACKAGE_NAME]);
80-
expect(Reporter.properties).to.deep.equal([{hashedName: pandasHash}]);
90+
Reporter.expectHashes(pandasHash);
8191
});
8292

8393
test('Already opened documents', async () => {
8494
const doc = createDocument('import pandas\r\n', 'foo.py', 1, TypeMoq.Times.atMost(100), true);
8595
documentManager.setup(d => d.textDocuments).returns(() => [doc.object]);
8696
await importTracker.activate();
8797

88-
expect(Reporter.eventNames).to.deep.equal([EventName.HASHED_PACKAGE_NAME]);
89-
expect(Reporter.properties).to.deep.equal([{hashedName: pandasHash}]);
98+
Reporter.expectHashes(pandasHash);
9099
});
91100

92101
test('Save document', () => {
93102
emitDocEvent('import pandas\r\n', savedEventEmitter);
94103

95-
expect(Reporter.eventNames).to.deep.equal([EventName.HASHED_PACKAGE_NAME]);
96-
expect(Reporter.properties).to.deep.equal([{hashedName: pandasHash}]);
104+
Reporter.expectHashes(pandasHash);
97105
});
98106

99107
test('from <pkg>._ import _, _', () => {
@@ -120,7 +128,7 @@ suite('Import Tracker', () => {
120128
model.set_weights(weights)`;
121129

122130
emitDocEvent(elephas, savedEventEmitter);
123-
expect(Reporter.properties).to.deep.equal([{hashedName: elephasHash}, {hashedName: kerasHash}]);
131+
Reporter.expectHashes(elephasHash, kerasHash);
124132
});
125133

126134
test('from <pkg>._ import _', () => {
@@ -142,7 +150,7 @@ suite('Import Tracker', () => {
142150
print("Training set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))`;
143151

144152
emitDocEvent(pyspark, savedEventEmitter);
145-
expect(Reporter.properties).to.deep.equal([{hashedName: pysparkHash}, {hashedName: sparkdlHash}]);
153+
Reporter.expectHashes(pysparkHash, sparkdlHash);
146154
});
147155

148156
test('import <pkg> as _', () => {
@@ -158,7 +166,7 @@ def simplify_ages(df):
158166
df.Age = categories
159167
return df`;
160168
emitDocEvent(code, savedEventEmitter);
161-
expect(Reporter.properties).to.deep.equal([{hashedName: pandasHash}, {hashedName: numpyHash}, {hashedName: randomHash}]);
169+
Reporter.expectHashes(pandasHash, numpyHash, randomHash);
162170
});
163171

164172
test('from <pkg> import _', () => {
@@ -172,14 +180,13 @@ x = np.array([r * np.cos(theta) for r in radius])
172180
y = np.array([r * np.sin(theta) for r in radius])
173181
z = np.array([drumhead_height(1, 1, r, theta, 0.5) for r in radius])`;
174182
emitDocEvent(code, savedEventEmitter);
175-
expect(Reporter.properties).to.deep.equal([{hashedName: scipyHash}]);
183+
Reporter.expectHashes(scipyHash);
176184
});
177185

178-
179186
test('from <pkg> import _ as _', () => {
180187
const code = `from pandas import DataFrame as df`;
181188
emitDocEvent(code, savedEventEmitter);
182-
expect(Reporter.properties).to.deep.equal([{hashedName: pandasHash}]);
189+
Reporter.expectHashes(pandasHash);
183190
});
184191

185192
test('import <pkg1>, <pkg2>', () => {
@@ -193,27 +200,7 @@ x = np.array([r * np.cos(theta) for r in radius])
193200
y = np.array([r * np.sin(theta) for r in radius])
194201
z = np.array([drumhead_height(1, 1, r, theta, 0.5) for r in radius])`;
195202
emitDocEvent(code, savedEventEmitter);
196-
expect(Reporter.properties).to.deep.equal([{hashedName: sklearnHash}, {hashedName: pandasHash}]);
197-
});
198-
199-
/*test('from <pkg> import (_, _)', () => {
200-
const code = `from pandas import (DataFrame, Series)`;
201-
emitDocEvent(code, savedEventEmitter);
202-
expect(Reporter.properties).to.deep.equal([{hashedName: pandasHash}]);
203-
});
204-
205-
test('from <pkg> import (_,', () => {
206-
const code = `
207-
from pandas import (DataFrame,
208-
Series)`;
209-
emitDocEvent(code, savedEventEmitter);
210-
expect(Reporter.properties).to.deep.equal([{hashedName: pandasHash}]);
211-
});*/
212-
213-
test('import pkg # Comment', () => {
214-
const code = `import pandas # Because we wants it.`;
215-
emitDocEvent(code, savedEventEmitter);
216-
expect(Reporter.properties).to.deep.equal([{hashedName: pandasHash}]);
203+
Reporter.expectHashes(sklearnHash, pandasHash);
217204
});
218205

219206
test('Import from within a function', () => {
@@ -227,36 +214,36 @@ x = np.array([r * np.cos(theta) for r in radius])
227214
y = np.array([r * np.sin(theta) for r in radius])
228215
z = np.array([drumhead_height(1, 1, r, theta, 0.5) for r in radius])`;
229216
emitDocEvent(code, savedEventEmitter);
230-
expect(Reporter.properties).to.deep.equal([{hashedName: sklearnHash}]);
217+
Reporter.expectHashes(sklearnHash);
231218
});
232219

233220
test('Do not send the same package twice', () => {
234221
const code = `
235222
import pandas
236223
import pandas`;
237224
emitDocEvent(code, savedEventEmitter);
238-
expect(Reporter.properties).to.deep.equal([{hashedName: pandasHash}]);
225+
Reporter.expectHashes(pandasHash);
239226
});
240227

241228
test('Ignore relative imports', () => {
242229
const code = 'from .pandas import not_real';
243230
emitDocEvent(code, savedEventEmitter);
244-
expect(Reporter.properties).to.deep.equal([]);
231+
Reporter.expectHashes();
245232
});
246233

247234
test('Ignore docstring for `from` imports', () => {
248235
const code = `"""
249236
from numpy import the random function
250237
"""`;
251238
emitDocEvent(code, savedEventEmitter);
252-
expect(Reporter.properties).to.deep.equal([]);
239+
Reporter.expectHashes();
253240
});
254241

255242
test('Ignore docstring for `import` imports', () => {
256243
const code = `"""
257244
import numpy for all the things
258245
"""`;
259246
emitDocEvent(code, savedEventEmitter);
260-
expect(Reporter.properties).to.deep.equal([]);
247+
Reporter.expectHashes();
261248
});
262249
});

0 commit comments

Comments
 (0)