@@ -386,6 +386,7 @@ def _mutate_merge_var_helper(vars_):
386386
387387
388388def mutate_merge_var_mix (child ):
389+ """Merges two variables into one, potentially merging node and edge vars."""
389390 vars_ = child .vars_in_graph
390391 rand_vars , merge_able_vars = _mutate_merge_var_helper (vars_ )
391392
@@ -399,6 +400,11 @@ def mutate_merge_var_mix(child):
399400
400401
401402def mutate_merge_var_sep (child ):
403+ """Merges two variables into one, won't merge node and edge vars.
404+
405+ Considers the node variables and edge variables separately.
406+ Depending on availability either merges 2 node variables or 2 edge variables.
407+ """
402408 node_vars = {n for n in child .nodes if isinstance (n , Variable )}
403409 rand_node_vars , merge_able_node_vars = _mutate_merge_var_helper (node_vars )
404410
@@ -436,6 +442,14 @@ def mutate_del_triple(child):
436442
437443
438444def mutate_expand_node (child , pb_en_out_link ):
445+ """Expands a random node by adding a new var-only triple to it.
446+
447+ Randomly selects a node. Then (depending on the probability pb_en_out_link)
448+ adds an outgoing or incoming triple with two new vars to it.
449+
450+ :arg pb_en_out_link: Probability to create an outgoing triple.
451+ :return: A child with the added outgoing/incoming triple.
452+ """
439453 # TODO: can maybe be improved by sparqling
440454 nodes = list (child .nodes )
441455 node = random .choice (nodes )
@@ -449,6 +463,13 @@ def mutate_expand_node(child, pb_en_out_link):
449463
450464
451465def mutate_add_edge (child ):
466+ """Adds an edge between 2 randomly selected nodes.
467+
468+ Randomly selects two nodes, then adds a new triple (n1, e, n2), where e is
469+ a new variable.
470+
471+ :return: A child with the added edge.
472+ """
452473 # TODO: can maybe be improved by sparqling
453474 nodes = list (child .nodes )
454475 if len (nodes ) < 2 :
@@ -460,6 +481,13 @@ def mutate_add_edge(child):
460481
461482
462483def mutate_increase_dist (child ):
484+ """Increases the distance between ?source and ?target by one hop.
485+
486+ Randomly adds a var-only triple to the ?source or ?target var. Then swaps
487+ the new node with ?source/?target to increase the distance by one hop.
488+
489+ :return: A child with increased distance between ?source and ?target.
490+ """
463491 if not child .complete ():
464492 return child
465493 var_node = gen_random_var ()
@@ -477,6 +505,13 @@ def mutate_increase_dist(child):
477505
478506
479507def mutate_fix_var_filter (item_counts ):
508+ """Filters results for fix var mutation.
509+
510+ Excludes:
511+ - too long literals
512+ - URIs with encoding errors (real world!)
513+ - BNode results (they will not be fixed but stay SPARQL vars)
514+ """
480515 assert isinstance (item_counts , Counter )
481516 for i in list (item_counts .keys ()):
482517 if isinstance (i , Literal ):
@@ -514,21 +549,59 @@ def mutate_fix_var(
514549 timeout ,
515550 gtp_scores ,
516551 child ,
517- gtp_sample_n = config .MUTPB_FV_RGTP_SAMPLE_N ,
552+ gtp_sample_max_n = config .MUTPB_FV_RGTP_SAMPLE_N ,
518553 rand_var = None ,
519- sample_n = config .MUTPB_FV_SAMPLE_MAXN ,
554+ sample_max_n = config .MUTPB_FV_SAMPLE_MAXN ,
520555 limit = config .MUTPB_FV_QUERY_LIMIT ,
521556):
557+ """Finds possible fixations for a randomly selected variable of the pattern.
558+
559+ This is a very important mutation of the gp learner, as it is the main
560+ source of actually gaining information from the SPARQL endpoint.
561+
562+ The outline of the mutation is as follows:
563+ - If not passed in, randomly selects a variable (rand_var) of the pattern
564+ (node or edge var, excluding ?source and ?target).
565+ - Randomly selects a subset of up to gtp_sample_max_n GTPs with
566+ probabilities according to their remaining gains. The number of GTPs
567+ picked is randomized (see below).
568+ - Issues SPARQL queries to find possible fixations for the selected variable
569+ under the previously selected GTPs subset. Counts the fixation's
570+ occurrences wrt. the GTPs and sorts the result descending by these counts.
571+ - Limits the result rows to deal with potential long-tails.
572+ - Filters the resulting rows with mutate_fix_var_filter.
573+ - From the limited, filtered result rows randomly selects up to sample_max_n
574+ candidate fixations with probabilities according to their counts.
575+ - For each candidate fixation returns a child in which rand_var is replaced
576+ with the candidate fixation.
577+
578+ The reasons for fixing rand_var based on a randomly sized subset of GTPs
579+ are efficiency and shadowing problems with common long-tails. Due to the
580+ later imposed limit (which is vital in real world use-cases),
581+ a few remaining GTPs that share more than `limit` potential fixations (so
582+ have a common long-tail) could otherwise hide solutions for other
583+ remaining GTPs. This can be the case if these common fixations have low
584+ fitness. By randomizing the subset size, we will eventually (and more
585+ likely) select other combinations of remaining GTPs.
586+
587+ :param gtp_sample_max_n: Maximum GTPs subset size to base fixations on.
588+ :param rand_var: If given uses this variable instead of a random one.
589+ :param sample_max_n: Maximum number of children.
590+ :param limit: SPARQL limit for the top-k result rows.
591+ :return: A list of children in which the selected variable is substituted
592+ with fixation candidates wrt. GTPs.
593+ """
522594 assert isinstance (child , GraphPattern )
523595 assert isinstance (gtp_scores , GTPScores )
524596
525597 # The further we get, the less gtps are remaining. Sampling too many (all)
526598 # of them might hurt as common substitutions (> limit ones) which are dead
527599 # ends could cover less common ones that could actually help
528- gtp_sample_n = min (gtp_sample_n , int (gtp_scores .remaining_gain ))
529- gtp_sample_n = random .randint (1 , gtp_sample_n )
600+ gtp_sample_max_n = min (gtp_sample_max_n , int (gtp_scores .remaining_gain ))
601+ gtp_sample_max_n = random .randint (1 , gtp_sample_max_n )
530602
531- ground_truth_pairs = gtp_scores .remaining_gain_sample_gtps (n = gtp_sample_n )
603+ ground_truth_pairs = gtp_scores .remaining_gain_sample_gtps (
604+ max_n = gtp_sample_max_n )
532605 rand_vars = child .vars_in_graph - {SOURCE_VAR , TARGET_VAR }
533606 if len (rand_vars ) < 1 :
534607 return [child ]
@@ -549,13 +622,13 @@ def mutate_fix_var(
549622 return [child ]
550623 # randomly pick n of the substitutions with a prob ~ to their counts
551624 items , counts = zip (* substitution_counts .most_common ())
552- substs = sample_from_list (items , counts , sample_n )
625+ substs = sample_from_list (items , counts , sample_max_n )
553626 logger .info (
554627 'fixed variable %s in %sto:\n %s\n <%d out of:\n %s\n ' ,
555628 rand_var .n3 (),
556629 child ,
557630 '\n ' .join ([subst .n3 () for subst in substs ]),
558- sample_n ,
631+ sample_max_n ,
559632 '\n ' .join ([' %d: %s' % (c , v .n3 ())
560633 for v , c in substitution_counts .most_common ()]),
561634 )
0 commit comments