Skip to content

Commit e2146da

Browse files
author
Gal Ben David
committed
introduce new function called scan_from_url
1 parent 8d8c0b9 commit e2146da

File tree

8 files changed

+107
-17
lines changed

8 files changed

+107
-17
lines changed

Cargo.toml

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,20 @@
11
[package]
22
name = "pyrepscan"
3-
version = "0.7.4"
3+
version = "0.8.0"
44
authors = ["Gal Ben David <gal@intsights.com>"]
55
edition = "2018"
66
description = "A Git Repository Secrets Scanner written in Rust"
77
readme = "README.md"
88
repository = "https://github.com/intsights/pyrepscan"
99
homepage = "https://github.com/intsights/pyrepscan"
1010
license = "MIT"
11-
keywords = ["adblock", "ads", "adblocker", "rust", "brave", "abp", "pyo3"]
11+
keywords = [
12+
"git",
13+
"secrets",
14+
"scanner",
15+
"rust",
16+
"pyo3",
17+
]
1218

1319
[package.metadata.maturin]
1420
requires-python = ">=3.6"
@@ -33,13 +39,13 @@ regex = "1"
3339
rayon = "1.5"
3440
chrono = "0.4"
3541
parking_lot = "0.11"
42+
num_cpus = "1"
3643

3744
[dependencies.git2]
3845
version = "0.13"
39-
default-features = false
4046

4147
[dependencies.pyo3]
42-
version = "0.13.1"
48+
version = "0.13.2"
4349
features = ["extension-module"]
4450

4551
[profile.release]

README.md

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,22 @@ A sample result would look like this:
142142
```
143143

144144

145+
```python
146+
def scan_from_url(
147+
self,
148+
url: str,
149+
repository_path: str,
150+
branch_glob_pattern: typing.Optional[str],
151+
from_timestamp: typing.Optional[int],
152+
) -> typing.List[typing.Dict[str, str]]
153+
```
154+
The same as `scan` function but also clones a repository from a given URL into the provided repository path.
155+
- `url` - URL of a git repository.
156+
- `repository_path` - The path to clone the repository to.
157+
- `branch_glob_pattern` - A glob pattern to filter branches for the scan. If None is sent, defaults to `*`.
158+
- `from_timestamp` - A UTC timestamp (int). Only commits created after this timestamp are included in the scan. If None is sent, defaults to `0`.
159+
160+
145161
```python
146162
def get_file_content(
147163
self,

pyproject.toml

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ strip = true
1414

1515
[tool.poetry]
1616
name = "pyrepscan"
17-
version = "0.7.4"
17+
version = "0.8.0"
1818
authors = ["Gal Ben David <gal@intsights.com>"]
1919
description = "A Git Repository Secrets Scanner written in Rust"
2020
readme = "README.md"
@@ -49,3 +49,13 @@ gitpython = "*"
4949
wheel = "*"
5050
pytest-runner = "*"
5151
maturin = "*"
52+
53+
[tool.pytest.ini_options]
54+
minversion = "6.0"
55+
addopts = [
56+
"--tb=native",
57+
"--pythonwarnings=all",
58+
]
59+
testpaths = [
60+
"tests",
61+
]

pyrepscan/pyrepscan.pyi

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,14 @@ class GitRepositoryScanner:
3737
from_timestamp: typing.Optional[int],
3838
) -> typing.List[typing.Dict[str, str]]: ...
3939

40+
def scan_from_url(
41+
self,
42+
url: str,
43+
repository_path: str,
44+
branch_glob_pattern: typing.Optional[str],
45+
from_timestamp: typing.Optional[int],
46+
) -> typing.List[typing.Dict[str, str]]: ...
47+
4048
def get_file_content(
4149
self,
4250
repository_path: str,

setup.cfg

Lines changed: 0 additions & 5 deletions
This file was deleted.

src/git_repository_scanner.rs

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,12 @@ use crate::rules_manager;
22

33
use chrono::prelude::*;
44
use git2::{Oid, Repository, Delta};
5+
use git2::Error;
56
use parking_lot::Mutex;
67
use rayon::prelude::*;
78
use std::collections::HashMap;
8-
use std::sync::Arc;
9-
use git2::Error;
109
use std::path::Path;
10+
use std::sync::Arc;
1111

1212
fn scan_commit_oid(
1313
git_repo: &Repository,
@@ -152,13 +152,19 @@ pub fn scan_repository(
152152
}
153153
}
154154
}
155+
156+
let chunk_size = (oids.len() as f64 / (num_cpus::get() * 5) as f64).ceil() as usize;
155157
if !oids.is_empty() {
156-
let chunk_size = (oids.len() as f64 / 100.0).ceil();
157-
oids.par_chunks(chunk_size as usize).for_each_init(
158-
|| Repository::open(repository_path).unwrap(),
159-
|git_repo, oids| {
158+
oids.par_chunks(chunk_size).for_each(
159+
|oids| {
160+
let git_repo = Repository::open(repository_path).unwrap();
160161
for oid in oids {
161-
scan_commit_oid(git_repo, oid, rules_manager, output_matches.clone()).unwrap_or(());
162+
scan_commit_oid(
163+
&git_repo,
164+
oid,
165+
rules_manager,
166+
output_matches.clone()
167+
).unwrap_or(());
162168
}
163169
},
164170
);

src/lib.rs

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
mod git_repository_scanner;
22
mod rules_manager;
33

4+
use git2::Repository;
45
use parking_lot::Mutex;
56
use pyo3::exceptions;
67
use pyo3::prelude::*;
@@ -213,6 +214,52 @@ impl GitRepositoryScanner {
213214
Ok(matches.lock().to_object(py))
214215
}
215216
}
217+
218+
/// Clone a git repository from a URL and scan it for secrets. Rules should be loaded before calling this function.
219+
///
220+
/// input:
221+
/// url: str -> URL of a git repository
222+
/// repository_path: str -> The path to clone the repository to
223+
/// branch_glob_pattern: str -> A glob pattern to match against the git branch names.
224+
/// Only matched branches will be scanned.
225+
/// from_timestamp: int = 0 -> Unix epoch timestamp to start the scan from.
226+
///
227+
/// returns:
228+
/// list[dict] -> List of matches
229+
///
230+
/// example:
231+
/// grs.scan_from_url(
232+
/// url="https://github.com/rust-lang/git2-rs",
233+
/// repository_path="/path/to/repository",
234+
/// branch_glob_pattern="*",
235+
/// )
236+
#[text_signature = "(url, repository_path, branch_glob_pattern, from_timestamp, /)"]
237+
fn scan_from_url(
238+
&self,
239+
py: Python,
240+
url: &str,
241+
repository_path: &str,
242+
branch_glob_pattern: Option<&str>,
243+
from_timestamp: Option<i64>,
244+
) -> PyResult<Py<PyAny>> {
245+
let matches = Arc::new(Mutex::new(Vec::<HashMap<&str, String>>::with_capacity(10000)));
246+
247+
if let Err(error) = Repository::clone(url, repository_path) {
248+
return Err(exceptions::PyRuntimeError::new_err(error.to_string()));
249+
};
250+
251+
if let Err(error) = git_repository_scanner::scan_repository(
252+
repository_path,
253+
branch_glob_pattern.unwrap_or("*"),
254+
from_timestamp.unwrap_or(0),
255+
&self.rules_manager,
256+
matches.clone(),
257+
) {
258+
Err(exceptions::PyRuntimeError::new_err(error.to_string()))
259+
} else {
260+
Ok(matches.lock().to_object(py))
261+
}
262+
}
216263
}
217264

218265
/// PyRepScan is a Python library written in Rust. The library provides an API to scan git repositories

tests/test_git_repository_scanner.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@ def setUp(
1313
self,
1414
):
1515
self.tmpdir = tempfile.TemporaryDirectory()
16+
self.addCleanup(self.tmpdir.cleanup)
17+
1618
bare_repo = git.Repo.init(
1719
path=self.tmpdir.name,
1820
)

0 commit comments

Comments
 (0)