@@ -364,12 +364,7 @@ def _mutate_merge_var_helper(vars_):
364364
365365
366366def mutate_merge_var_mix (child ):
367- """Merges two variables into one.
368-
369- Considers both node variables and edge variables together.
370- It is possible to merge an edge and a node too.
371- Randomly chooses a variable to replace and a variable to merge into.
372- """
367+ """Merges two variables into one, potentially merging node and edge vars."""
373368 vars_ = child .vars_in_graph
374369 rand_vars , merge_able_vars = _mutate_merge_var_helper (vars_ )
375370
@@ -383,11 +378,10 @@ def mutate_merge_var_mix(child):
383378
384379
385380def mutate_merge_var_sep (child ):
386- """Merges two variables into one.
381+ """Merges two variables into one, won't merge node and edge vars.
387382
388383 Considers the node variables and edge variables separately.
389- Either merges 2 node variables or 2 edge variable, depending on a random
390- choice.Randomly chooses a variable to replace and a variable to merge into.
384+ Depending on availability either merges 2 node variables or 2 edge variables.
391385 """
392386 node_vars = {n for n in child .nodes if isinstance (n , Variable )}
393387 rand_node_vars , merge_able_node_vars = _mutate_merge_var_helper (node_vars )
@@ -426,12 +420,13 @@ def mutate_del_triple(child):
426420
427421
428422def mutate_expand_node (child , pb_en_out_link ):
429- """Expands a random node of the pattern by adding a new triple to it.
423+ """Expands a random node by adding a new var-only triple to it.
430424
431- The variables to be attached to this node, to form a triple, are chosen
432- randomly.Depending on the probability, makes it an outgoing edge or an
433- incoming edge.
434- :return: The modified child, with the added triple.
425+ Randomly selects a node. Then (depending on the probability pb_en_out_link)
426+ adds an outgoing or incoming triple with two new vars to it.
427+
428+ :arg pb_en_out_link: Probability to create an outgoing triple.
429+ :return: A child with the added outgoing/incoming triple.
435430 """
436431 # TODO: can maybe be improved by sparqling
437432 nodes = list (child .nodes )
@@ -446,10 +441,12 @@ def mutate_expand_node(child, pb_en_out_link):
446441
447442
448443def mutate_add_edge (child ):
449- """Chooses any 2 nodes from the pattern, and adds an edge between them.
444+ """Adds an edge between 2 randomly selected nodes.
445+
446+ Randomly selects two nodes, then adds a new triple (n1, e, n2), where e is
447+ a new variable.
450448
451- The edge is labeled with a new randomly chosen variable.
452- :return: Modified child, with the new edge
449+ :return: A child with the added edge.
453450 """
454451 # TODO: can maybe be improved by sparqling
455452 nodes = list (child .nodes )
@@ -462,11 +459,12 @@ def mutate_add_edge(child):
462459
463460
464461def mutate_increase_dist (child ):
465- """increases distance between source and target by one hop.
462+ """Increases the distance between ?source and ?target by one hop.
466463
467- Adds a triple, to either the source var or the target var.
468- Interchange the new node with source/target variable to increase distance.
469- :return: The modified child, with the new triple.
464+ Randomly adds a var-only triple to the ?source or ?target var. Then swaps
465+ the new node with ?source/?target to increase the distance by one hop.
466+
467+ :return: A child with increased distance between ?source and ?target.
470468 """
471469 if not child .complete ():
472470 return child
@@ -485,6 +483,13 @@ def mutate_increase_dist(child):
485483
486484
487485def mutate_fix_var_filter (item_counts ):
486+ """Filters results for fix var mutation.
487+
488+ Excludes:
489+ - too long literals
490+ - URIs with encoding errors (real world!)
491+ - BNode results (they will not be fixed but stay SPARQL vars)
492+ """
488493 assert isinstance (item_counts , Counter )
489494 for i in list (item_counts .keys ()):
490495 if isinstance (i , Literal ):
@@ -527,9 +532,42 @@ def mutate_fix_var(
527532 sample_max_n = config .MUTPB_FV_SAMPLE_MAXN ,
528533 limit = config .MUTPB_FV_QUERY_LIMIT ,
529534):
530- """Chooses a random variable from the pattern(node or edge).
531-
532- Substitutes it with all possible fixed variables.
535+ """Finds possible fixations for a randomly selected variable of the pattern.
536+
537+ This is a very important mutation of the gp learner, as it is the main
538+ source of actually gaining information from the SPARQL endpoint.
539+
540+ The outline of the mutation is as follows:
541+ - If not passed in, randomly selects a variable (rand_var) of the pattern
542+ (node or edge var, excluding ?source and ?target).
543+ - Randomly selects a subset of up to gtp_sample_max_n GTPs with
544+ probabilities according to their remaining gains. The number of GTPs
545+ picked is randomized (see below).
546+ - Issues SPARQL queries to find possible fixations for the selected variable
547+ under the previously selected GTPs subset. Counts the fixation's
548+ occurrences wrt. the GTPs and sorts the result descending by these counts.
549+ - Limits the result rows to deal with potential long-tails.
550+ - Filters the resulting rows with mutate_fix_var_filter.
551+ - From the limited, filtered result rows randomly selects up to sample_max_n
552+ candidate fixations with probabilities according to their counts.
553+ - For each candidate fixation returns a child in which rand_var is replaced
554+ with the candidate fixation.
555+
556+ The reasons for fixing rand_var based on a randomly sized subset of GTPs
557+ are efficiency and shadowing problems with common long-tails. Due to the
558+ later imposed limit (which is vital in real world use-cases),
559+ a few remaining GTPs that share more than `limit` potential fixations (so
560+ have a common long-tail) could otherwise hide solutions for other
561+ remaining GTPs. This can be the case if these common fixations have low
562+ fitness. By randomizing the subset size, we will eventually (and more
563+ likely) select other combinations of remaining GTPs.
564+
565+ :param gtp_sample_max_n: Maximum GTPs subset size to base fixations on.
566+ :param rand_var: If given uses this variable instead of a random one.
567+ :param sample_max_n: Maximum number of children.
568+ :param limit: SPARQL limit for the top-k result rows.
569+ :return: A list of children in which the selected variable is substituted
570+ with fixation candidates wrt. GTPs.
533571 """
534572 assert isinstance (child , GraphPattern )
535573 assert isinstance (gtp_scores , GTPScores )
0 commit comments