Skip to content

Commit 130225f

Browse files
authored
Merge pull request threatgrid#56 from quoll/outer-products-55
Adds outer products and optimizes large query analysis
2 parents d63f40b + ba797db commit 130225f

File tree

4 files changed

+160
-31
lines changed

4 files changed

+160
-31
lines changed

src/naga/lang/basic.clj

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@
6060

6161
;; This does not include all legal characters.
6262
;; Consider some others in future, especially >
63-
(def ns-word (many1 (choice (letter) (char \_) (char \-) (char \:))))
63+
(def ns-word (many1 (choice (letter) (digit) (char \_) (char \-) (char \:))))
6464

6565
(def word (many1 (letter)))
6666

@@ -97,7 +97,7 @@
9797
;; variables start with a capital. Internally they start with ?
9898
(defparser variable []
9999
(let->> [f (upper-case-letter)
100-
r (many (letter))]
100+
r (many (choice (letter) (digit) (char \_) (char \-)))]
101101
(always (symbol (apply str "?" (Character/toLowerCase f) r)))))
102102

103103
(defn build-keyword
@@ -106,7 +106,8 @@
106106
(let [[kns kname :as w] (str/split wrd #":")
107107
parts (count w)]
108108
;; use cond without a default to return nil
109-
(cond (= 2 parts) (cond (empty? kns) (keyword kname)
109+
(cond (Character/isDigit (first wrd)) nil
110+
(= 2 parts) (cond (empty? kns) (keyword kname)
110111
(seq kname) (keyword kns kname))
111112
(= 1 parts) (if-not (str/ends-with? wrd ":")
112113
(keyword kns)))))

src/naga/storage/memory/core.clj

Lines changed: 70 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -23,33 +23,49 @@
2323
s :- [s/Any]]
2424
(remove (partial = e) s))
2525

26+
(s/defn find-start :- EPVPattern
27+
"Returns the first pattern with the smallest count"
28+
[pattern-counts :- {EPVPattern s/Num}
29+
patterns :- [EPVPattern]]
30+
(let [local-counts (select-keys pattern-counts patterns)
31+
low-count (reduce min (map second local-counts))
32+
pattern (ffirst (filter #(= low-count (second %)) local-counts))]
33+
;; must use first/filter/= instead of some/#{pattern} because
34+
;; patterns contains metadata and pattern does not
35+
(first (filter (partial = pattern) patterns))))
2636

2737
(s/defn paths :- [[EPVPattern]]
2838
"Returns a seq of all paths through the constraints. A path is defined
2939
by new patterns containing at least one variable common to the patterns
30-
that appeared before it. This prevents cross products in a join."
31-
([patterns :- [EPVPattern]]
32-
(let [all-paths (paths #{} patterns)]
33-
(assert (every? (partial = (count patterns)) (map count all-paths))
34-
(str "No valid paths through: " (vec patterns)))
35-
all-paths))
36-
([bound :- #{Symbol}
37-
patterns :- [EPVPattern]]
38-
(apply concat
39-
(keep ;; discard paths that can't proceed (they return nil)
40-
(fn [p]
41-
(let [b (get-vars p)]
42-
;; only proceed when the pattern matches what has been bound
43-
(if (or (empty? bound) (seq (set/intersection b bound)))
44-
;; pattern can be added to the path, get the other patterns
45-
(let [remaining (without p patterns)]
46-
;; if there are more patterns to add to the path, recurse
47-
(if (seq remaining)
48-
(map (partial cons p)
49-
(seq
50-
(paths (into bound b) remaining)))
51-
[[p]])))))
52-
patterns))))
40+
that appeared before it. Patterns must form a group."
41+
([patterns :- [EPVPattern]
42+
pattern-counts :- {EPVPattern s/Num}]
43+
(s/letfn [(remaining-paths :- [[EPVPattern]]
44+
[bound :- #{Symbol}
45+
rpatterns :- [EPVPattern]]
46+
(if (seq rpatterns)
47+
(apply concat
48+
(keep ;; discard paths that can't proceed (they return nil)
49+
(fn [p]
50+
(let [b (get-vars p)]
51+
;; only proceed when the pattern matches what has been bound
52+
(if (or (empty? bound) (seq (set/intersection b bound)))
53+
;; pattern can be added to the path, get the other patterns
54+
(let [remaining (without p rpatterns)]
55+
;; if there are more patterns to add to the path, recurse
56+
(if (seq remaining)
57+
(map (partial cons p)
58+
(seq
59+
(remaining-paths (into bound b) remaining)))
60+
[[p]])))))
61+
rpatterns))
62+
[[]]))]
63+
(let [start (find-start pattern-counts patterns)
64+
all-paths (map (partial cons start)
65+
(remaining-paths (get-vars start) (without start patterns)))]
66+
(assert (every? (partial = (count patterns)) (map count all-paths))
67+
(str "No valid paths through: " (vec patterns)))
68+
all-paths))))
5369

5470

5571
(def epv-pattern? vector?)
@@ -75,18 +91,44 @@
7591
(recur (into plan nxt-filters) bound patterns remaining-filters)
7692
(recur (conj plan np) (into bound (get-vars np)) rp filters)))))))
7793

94+
(s/defn first-group :- [(s/one [Pattern] "group") (s/one [Pattern] "remainder")]
95+
"Finds a group from a sequence of patterns. A group is defined by every pattern
96+
sharing at least one var with at least one other pattern. Returns a pair.
97+
The first returned element is the Patterns in the group, the second is what was left over."
98+
[[fp & rp] :- [Pattern]]
99+
(letfn [;; Define a reduction step.
100+
;; Accumulates a triple of: known vars; patterns that are part of the group;
101+
;; patterns that are not in the group. Each step looks at a pattern for
102+
;; inclusion or exclusion
103+
(step [[vs included excluded] next-pattern]
104+
(let [new-vars (get-vars next-pattern)]
105+
(if (seq (set/intersection vs new-vars))
106+
[(into vs new-vars) (conj included next-pattern) excluded]
107+
[vs included (conj excluded next-pattern)])))
108+
;; apply the reduction steps, with a given set of known vars, and
109+
;; included patterns. Previously excluded patterns are being scanned
110+
;; again using the new known vars.
111+
(groups [[v i e]] (reduce step [v i []] e))]
112+
;; scan for everything that matches the first pattern, and then iterate until
113+
;; everything that matches the resulting patterns has also been found.
114+
;; Drop the set of vars before returning.
115+
(rest (u/fixpoint groups [(get-vars fp) [fp] rp]))))
116+
78117
(s/defn min-join-path :- [EPVPattern]
79118
"Calculates a plan based on no outer joins (a cross product), and minimized joins.
80119
A plan is the order in which to evaluate constraints and join them to the accumulated
81120
evaluated data. If it is not possible to create a path without a cross product,
82121
then return a plan of the patterns in the provided order."
83122
[patterns :- [Pattern]
84123
count-map :- {EPVPattern s/Num}]
85-
(or
86-
(->> (paths patterns)
87-
(sort-by (partial mapv count-map))
88-
first)
89-
patterns)) ;; TODO: longest paths with minimized cross products
124+
(loop [[grp rmdr] (first-group patterns) ordered []]
125+
(let [all-ordered (->> (paths grp count-map)
126+
(sort-by (partial mapv count-map))
127+
first
128+
(concat ordered))] ;; TODO: order groups, rather than concat as found
129+
(if (empty? rmdr)
130+
all-ordered
131+
(recur (first-group rmdr) all-ordered)))))
90132

91133
(s/defn user-plan :- [EPVPattern]
92134
"Returns the original path specified by the user"

src/naga/util.clj

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,3 +27,23 @@
2727
symbol
2828
find-ns
2929
(ns-resolve snm))))
30+
31+
(s/defn divide :- [[s/Any] [s/Any]]
32+
"Takes a predicate and a sequence and returns 2 sequences.
33+
The first is where the predicate returns true, and the second
34+
is where the predicate returns false. Note that a nil value
35+
will not be returned in either sequence, regardless of the
36+
value returned by the predicate."
37+
[p
38+
s :- [s/Any]]
39+
(let [d (map (fn [x] (if (p x) [x nil] [nil x])) s)]
40+
[(keep first d) (keep second d)]))
41+
42+
(defn fixpoint
43+
"Applies the function f to the value a. The function is then,
44+
and applied to the result, over and over, until the result does not change.
45+
Returns the final result.
46+
Note: If the function has no fixpoint, then runs forever."
47+
[f a]
48+
(let [s (iterate f a)]
49+
(some identity (map #(#{%1} %2) s (rest s)))))

test/naga/storage/test_memory.clj

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,3 +111,69 @@
111111
(is (= #{[:pa :t]
112112
[:pb :u]
113113
[:pz :l]} r1))))
114+
115+
(deftest test-query-path
116+
(let [simple-p '[[?a :a :b] [?b :c :d]]
117+
simple-cm '{[?a :a :b] 1, [?b :c :d] 1}
118+
[g] (first-group simple-p)
119+
p (min-join-path simple-p simple-cm)
120+
simple-p2 '[[?a :a :b] [?b :c :d] [?c :e ?b] [?a :c :d]]
121+
simple-cm2 '{[?a :a :b] 1, [?b :c :d] 2, [?c :e ?b] 1, [?a :c :d] 1}
122+
[g2] (first-group simple-p2)
123+
p2 (min-join-path simple-p2 simple-cm2)
124+
patterns '[[?a :a :b]
125+
[?b :c ?d]
126+
[?d :d ?e]
127+
[?d :e ?f]
128+
[?f :f ?a]
129+
[?f :g ?g]
130+
[?g :v1 ?v1]
131+
[?g :v2 ?v2]
132+
[?h :v1 ?v1]
133+
[?h :v2 ?v2]
134+
[?i :i ?h]
135+
[?other :id "id"]]
136+
count-map '{[?a :a :b] 1
137+
[?b :c ?d] 2
138+
[?d :d ?e] 3
139+
[?d :e ?f] 3
140+
[?f :f ?a] 3
141+
[?f :g ?g] 5
142+
[?g :v1 ?v1] 3
143+
[?g :v2 ?v2] 4
144+
[?h :v1 ?v1] 5
145+
[?h :v2 ?v2] 6
146+
[?i :i ?h] 7
147+
[?other :id "id"] 1}
148+
[group] (first-group patterns)
149+
path (min-join-path patterns count-map)]
150+
151+
(is (= '[[?a :a :b]] g))
152+
(is (= '[[?a :a :b] [?b :c :d]] p))
153+
154+
(is (= '[[?a :a :b] [?a :c :d]] g2))
155+
(is (= '[[?a :a :b] [?a :c :d] [?c :e ?b] [?b :c :d]] p2))
156+
157+
(is (= '[[?a :a :b]
158+
[?f :f ?a]
159+
[?f :g ?g]
160+
[?g :v1 ?v1]
161+
[?g :v2 ?v2]
162+
[?h :v1 ?v1]
163+
[?h :v2 ?v2]
164+
[?i :i ?h]
165+
[?d :e ?f]
166+
[?b :c ?d]
167+
[?d :d ?e]] group))
168+
(is (= '[[?a :a :b]
169+
[?f :f ?a]
170+
[?d :e ?f]
171+
[?b :c ?d]
172+
[?d :d ?e]
173+
[?f :g ?g]
174+
[?g :v1 ?v1]
175+
[?g :v2 ?v2]
176+
[?h :v1 ?v1]
177+
[?h :v2 ?v2]
178+
[?i :i ?h]
179+
[?other :id "id"]] path))))

0 commit comments

Comments
 (0)