55from __future__ import print_function
66from __future__ import unicode_literals
77
8+ from copy import deepcopy
89from itertools import combinations
910from itertools import combinations_with_replacement
1011from itertools import permutations
1112from itertools import product
13+ import sys
1214
1315from rdflib import Variable
1416from scipy .special import binom
17+ from scipy .misc import comb
1518
1619from logging_config import logging
1720from graph_pattern import SOURCE_VAR
2427
2528
2629DEBUG = False
30+ HOLE = sys .maxint # placeholder for holes in partial patterns
2731
2832# debug logging in this module is actually quite expensive (> 30 % of time). In
2933# case it's undesired the following removes that overhead.
@@ -34,6 +38,173 @@ def quick_skip_debug_log(*args, **kwds):
3438 logger .debug = quick_skip_debug_log
3539
3640
def numerical_patterns(
        length,
        _partial_pattern=None,
        _pos=None,
        _var=1,
):
    """Numerical pattern generator.

    A pattern is a tuple of 3-tuples of variables, so for example the
    following is a pattern of length 2:
        ((?source, ?v3, ?target), (?target, ?v3, ?v4))

    For brevity, we can write the same as:
        'acb bcd' or numerical as '132 231'

    In the short version we could map ?source to 'a' or '1', ?target to 'b' or
    '2' and the other variables to the following letters / numbers.

    During generation we take care not to generate a whole lot of unnecessary
    duplicates (patterns that are obviously invalid or isomorphic to previous
    ones). The generator enforces that:
     - the triples are strictly ascending (sorted and pairwise distinct)
        NO: 221 112 --> YES: 112 221
        NO: 112 112
     - every triple shares at least one variable with an earlier triple
        NO: 123 456
        YES: 123 345
        YES: 123 132
     - the used variables don't skip a variable
        NO: 124 456 --> YES: 123 345
     - variables aren't unnecessarily high: a variable in triple i (0-based)
       never exceeds 3 + 2 * i
        NO: 124 334 --> YES: 123 443
        NO: 421 534 --> YES: 123 451
        YES: 312 411

    Note that patterns using a single variable (e.g. '111') are yielded as
    well.

    :param length: number of triples per pattern. Values < 1 yield nothing.
    :param _partial_pattern: private, kept for backward compatibility: a list
        of `length` 3-element lists of which only the entries before `_pos`
        are read. Leave as None to enumerate all patterns.
    :param _pos: private: (triple index, index in triple) at which `_var` is
        placed. Ignored (treated as (0, 0)) if `_partial_pattern` is empty.
    :param _var: private: the variable number placed at `_pos`.
    :return: generator of patterns, each a fresh list of `length` lists of 3
        ints, in ascending lexicographic order of the search.
    """
    if length < 1:
        # nothing to enumerate (indexing triple 0 would fail otherwise)
        return

    if _partial_pattern:
        i, j = _pos
        # only the already assigned positions in front of _pos matter, so no
        # placeholder value for the remaining "holes" is needed
        assigned = [v for t in _partial_pattern for v in t][:3 * i + j]
    else:
        assigned = []
    assigned.append(_var)

    for pattern in _complete_numerical_pattern(length, assigned):
        yield pattern


def _complete_numerical_pattern(length, assigned):
    """Yields all valid completions of the flat variable prefix `assigned`.

    `assigned` holds the variables of the pattern in reading order (3 per
    triple), its last entry being the one that was just placed. The list is
    extended in place during the search and restored before returning.
    """
    i, j = divmod(len(assigned) - 1, 3)

    if i >= 1:
        prev = assigned[3 * i - 3:3 * i]
        cur = assigned[3 * i:]
        # current triple must be larger than previous one for sorting and to
        # exclude multiple equivalent triples. While the current triple is
        # still incomplete we can only prune if its filled part is already
        # smaller than the corresponding part of the previous triple.
        if prev[:j + 1] > cur or (j == 2 and prev == cur):
            return

        if j == 2:
            # we just completed a triple, check that it's connected
            earlier_vars = set(assigned[:3 * i])
            if not any(v in earlier_vars for v in cur):
                # we're not connected, early terminate this
                # This is safe as a later triple can't reconnect us anymore
                # without an isomorphic, lower enumeration that would've been
                # encountered before:
                # say we have
                #   abc xyz uvw
                # with xyz not being connected yet and uvw or any later part
                # connecting xyz back to abc. We can just use a breadth first
                # search from abc via those connecting triples and re-label
                # all encountered vars by breadth first search encountering.
                # That re-labeling is guaranteed to forward connect and it
                # will generate a smaller labelling than the current one.
                return

    if len(assigned) >= 3 * length:
        # we're at the end of the pattern, hand out an independent copy
        yield [assigned[k:k + 3] for k in range(0, 3 * length, 3)]
        return

    next_i = len(assigned) // 3  # triple index of the next position
    end_var = min(
        max(assigned) + 1,  # can't skip a var
        3 + 2 * next_i,  # vars in triple i can't exceed this, else not sorted
    )
    for v in range(1, end_var + 1):
        assigned.append(v)
        for pattern in _complete_numerical_pattern(length, assigned):
            yield pattern
        assigned.pop()
146+
def patterns(
        length,
        exclude_isomorphic=True,
        count_candidates_only=False,
):
    """Generates graph patterns of the given length from numerical patterns.

    For every numerical pattern from `numerical_patterns(length)` each ordered
    pair of two distinct used numbers is tried as (source, target); the
    remaining numbers are mapped to variables v0, v1, ... in ascending order.

    :param length: number of triples per pattern.
    :param exclude_isomorphic: if True, each pattern is passed through
        `canonicalize` and skipped if its canonical form was generated before.
        The canonical form is what gets yielded in that case. Skipped
        candidates still consume a pattern id, so ids may have gaps.
    :param count_candidates_only: if True, no pattern is built at all, only
        the candidates are counted (with progress logged every 100000
        numerical patterns). Must not be combined with `exclude_isomorphic`.
    :return: generator of (pattern id, graph pattern) tuples, terminated by a
        final (last pattern id + 1, None) tuple. With `count_candidates_only`
        that final tuple is the only one and carries the candidate count.
    """
    assert not count_candidates_only or not exclude_isomorphic, \
        'count_candidates_only cannot be used with isomorphism check'

    seen_canonical = {}
    pid = -1

    for c, num_pat in enumerate(numerical_patterns(length)):
        numbers = sorted({v for trip in num_pat for v in trip})

        if count_candidates_only:
            n_vars = len(numbers)
            perms = n_vars * (n_vars - 1)
            pid += perms
            # yielding (pid, None) per candidate would be way slower, so we
            # rather show progress from here:
            if c % 100000 == 0:
                logger.info(
                    'pattern id: %d, vars: %d, permutations: %d',
                    pid, n_vars, perms
                )
            continue

        for s, t in permutations(numbers, 2):
            pid += 1

            # source and target are fixed, all others are numbered through
            var_map = {}
            others = [n for n in numbers if n not in (s, t)]
            for idx, n in enumerate(others):
                var_map[n] = Variable('v%d' % idx)
            var_map[s] = SOURCE_VAR
            var_map[t] = TARGET_VAR

            gp = GraphPattern(
                tuple(tuple(var_map[n] for n in trip) for trip in num_pat)
            )

            if exclude_isomorphic:
                # exclude patterns isomorphic to already generated ones
                cgp = canonicalize(gp)
                if cgp in seen_canonical:
                    igp_numpat, igp_s, igp_t, igp_gp = seen_canonical[cgp]
                    logger.debug(
                        'excluded isomorphic %s with ?s=%d, ?t=%d:\n '
                        'isomorphic to %s with ?s=%d, ?t=%d:\n '
                        '%sand\n %s',
                        num_pat, s, t,
                        igp_numpat, igp_s, igp_t,
                        gp, igp_gp,
                    )
                    continue
                seen_canonical[cgp] = (num_pat, s, t, gp)
                gp = cgp

            yield pid, gp

    yield pid + 1, None
206+
207+
37208def pattern_generator (
38209 length ,
39210 loops = True ,
@@ -119,24 +290,31 @@ def pattern_generator(
119290
120291
def main():
    """Ad-hoc driver: counts / prints the patterns of a fixed length.

    Runs `patterns` in candidate counting mode, prints what it yields and then
    checks that flipping any edge of a generated pattern results in a pattern
    that was generated as well (by canonical form).
    """
    length = 1
    # len | pcon | nej | pcon, nej    | candidates     | candidates  |
    #     |      |     | (canonical)  | (old method)   | (numerical) |
    # ----+------+-----+--------------+----------------+-------------+
    #   1 |    8 |  12 |           12 |             27 |          12 |
    #   2 |  146 | 469 |          693 |           7750 |        1314 |
    #   3 |      |     |        47478 |        6666891 |      151534 |
    #   4 |      |     |              |    11671285626 |    20884300 |
    #   5 |      |     |              | 34549552710596 |  3461471628 |

    gen_patterns = []
    i = 0
    for n, (i, pattern) in enumerate(patterns(length, False, True)):
        print('%d: Pattern id %d: %s' % (n, i, pattern))
        gen_patterns.append((i, pattern))
    print(i)

    # the last entry is the (pid, None) terminator, so leave it out
    _patterns = {gp for _, gp in gen_patterns[:-1]}

    # testing flipped edges
    for gp in _patterns:
        for edge_idx in range(length):
            mod_gp = gp.flip_edge(edge_idx)
            if len(mod_gp) != length:
                # can happen that flipped edge was there already
                continue
            assert canonicalize(mod_gp) in _patterns
141319
142320if __name__ == '__main__' :
0 commit comments