|
48 | 48 | from gp_query import predict_query |
49 | 49 | from gp_query import query_time_hard_exceeded |
50 | 50 | from gp_query import query_time_soft_exceeded |
| 51 | +from gp_query import variable_substitution_deep_narrow_mut_query |
51 | 52 | from gp_query import variable_substitution_query |
52 | 53 | from graph_pattern import canonicalize |
53 | 54 | from graph_pattern import gen_random_var |
@@ -653,6 +654,105 @@ def mutate_fix_var( |
653 | 654 | return res |
654 | 655 |
|
655 | 656 |
|
def _mutate_deep_narrow_path_helper(
        sparql,
        timeout,
        gtp_scores,
        child,
        edge_var,
        node_var,
        gtp_sample_n=config.MUTPB_FV_RGTP_SAMPLE_N,
        limit_res=config.MUTPB_DN_QUERY_LIMIT,
        sample_n=config.MUTPB_FV_SAMPLE_MAXN,
):
    """Try to replace ``edge_var`` in ``child`` by a concrete substitution.

    Samples ground truth pairs from ``gtp_scores``, asks
    ``variable_substitution_deep_narrow_mut_query`` for substitution counts
    of ``edge_var`` (together with per-edge node sums for ``node_var``),
    ranks the candidate edges and randomly picks one of the resulting
    patterns.

    :param sparql: passed through to the substitution query.
    :param timeout: passed through to the substitution query.
    :param gtp_scores: ``GTPScores`` used to sample ground truth pairs.
    :param child: ``GraphPattern`` containing ``edge_var`` and ``node_var``.
    :param edge_var: the variable that should be substituted.
    :param node_var: the variable whose counts are summed per edge.
    :param gtp_sample_n: upper bound for the number of sampled gtps.
    :param limit_res: result limit handed to the substitution query.
    :param sample_n: number of substitutions requested from
        ``sample_from_list``.
    :return: tuple ``(pattern, fixed)``. ``fixed`` is ``False`` if no
        candidate substitution was available (``pattern`` is then the
        unchanged ``child``). Note that ``fixed`` is ``True`` as soon as
        candidates were sampled, even if the randomly chosen candidate was
        not ``fit_to_live`` and the unchanged ``child`` is returned instead.
    """
    assert isinstance(child, GraphPattern)
    assert isinstance(gtp_scores, GTPScores)

    # The further we get, the fewer gtps are remaining. Sampling too many
    # (all) of them might hurt, as common substitutions (> limit ones) which
    # are dead ends could cover less common ones that could actually help.
    max_gtps = min(gtp_sample_n, int(gtp_scores.remaining_gain))
    if max_gtps < 1:
        # random.randint(1, 0) would raise a ValueError; without any gtp to
        # sample there is nothing we could base a substitution on.
        logger.debug(
            "no remaining gain to sample gtps from, can't fix var %s",
            edge_var)
        return child, False
    gtp_sample_n = random.randint(1, max_gtps)

    ground_truth_pairs = gtp_scores.remaining_gain_sample_gtps(
        n=gtp_sample_n)
    # first element of the query result is not needed here
    _, substitution_counts = variable_substitution_deep_narrow_mut_query(
        sparql, timeout, child, edge_var, node_var, ground_truth_pairs,
        limit_res)
    edge_count, node_sum_count = substitution_counts
    if not node_sum_count:
        # the current pattern is unfit, as we can't find anything fulfilling it
        logger.debug("tried to fix a var %s without result:\n%s"
                     "seems as if the pattern can't be fulfilled!",
                     edge_var, child.to_sparql_select_query())
        return child, False
    mutate_fix_var_filter(node_sum_count)
    mutate_fix_var_filter(edge_count)
    if not node_sum_count:
        # could have happened that we removed the only possible substitution
        return child, False

    prio = Counter()
    for edge, node_sum in node_sum_count.items():
        ec = edge_count.get(edge, 0)
        if not ec or not node_sum:
            # both counters are filtered independently above; skip entries
            # that would otherwise cause a division by zero
            continue
        prio[edge] = ec / (node_sum / ec)  # ec / AVG degree
    if not prio:
        # nothing left to choose from (zip(*[]) below couldn't be unpacked)
        return child, False

    # randomly pick n of the substitutions with a prob ~ to their priorities
    edges, prios = zip(*prio.most_common())
    substs = sample_from_list(edges, prios, sample_n)

    logger.info(
        'fixed variable %s in %sto:\n %s\n<%d out of:\n%s\n',
        edge_var.n3(),
        child,
        '\n '.join([subst.n3() for subst in substs]),
        sample_n,
        '\n'.join([
            ' %.3f: %s' % (c, v.n3()) for v, c in prio.most_common()]),
    )

    # build one candidate per substitution; unfit candidates fall back to
    # the unsubstituted pattern
    candidates = [
        GraphPattern(child, mapping={edge_var: subst})
        for subst in substs
    ]
    candidates = [
        c if fit_to_live(c) else child
        for c in candidates
    ]
    if candidates:
        child = random.choice(candidates)
    return child, True
| 728 | + |
| 729 | + |
def mutate_deep_narrow_path(
        child, sparql, timeout, gtp_scores,
        min_len=config.MUTPB_DN_MIN_LEN,
        max_len=config.MUTPB_DN_MAX_LEN,
        term_pb=config.MUTPB_DN_TERM_PB,
):
    """Extend ``child`` by a path that starts at one of its nodes.

    A random node of ``child`` is picked as the start. In every hop the
    current node is expanded via ``_mutate_expand_node_helper``, the new
    triple is appended to the pattern and
    ``_mutate_deep_narrow_path_helper`` is asked to fix the new edge
    variable. The new node variable becomes the start of the next hop.

    :param child: ``GraphPattern`` to extend.
    :param sparql: passed through to the helper.
    :param timeout: passed through to the helper.
    :param gtp_scores: passed through to the helper.
    :param min_len: number of hops after which random termination is allowed.
    :param max_len: number of hops after which the loop always stops.
    :param term_pb: per-iteration termination probability once ``min_len``
        hops were done.
    :return: the resulting pattern.
    """
    assert isinstance(child, GraphPattern)

    def _stop_after(hops_done):
        # the random draw intentionally happens before the max_len check
        if hops_done >= min_len and random.random() < term_pb:
            return True
        return hops_done >= max_len

    candidates = list(child.nodes)
    current_node = random.choice(candidates)
    gp = child
    hops_done = 0
    while not _stop_after(hops_done):
        hops_done += 1
        triple, next_node, next_edge = _mutate_expand_node_helper(
            current_node)
        gp += [triple]
        # NOTE(review): the helper's "fixed" flag is ignored, so the path
        # keeps growing even if the edge variable could not be fixed —
        # confirm this is intended.
        gp, _ = _mutate_deep_narrow_path_helper(
            sparql, timeout, gtp_scores, gp, next_edge, next_node)
        current_node = next_node
    return gp
| 754 | + |
| 755 | + |
656 | 756 | def mutate_simplify_pattern(gp): |
657 | 757 | if len(gp) < 2: |
658 | 758 | return gp |
@@ -757,6 +857,7 @@ def mutate( |
757 | 857 | pb_dt=config.MUTPB_DT, |
758 | 858 | pb_en=config.MUTPB_EN, |
759 | 859 | pb_fv=config.MUTPB_FV, |
| 860 | + pb_dn=config.MUTPB_DN, |
760 | 861 | pb_id=config.MUTPB_ID, |
761 | 862 | pb_iv=config.MUTPB_IV, |
762 | 863 | pb_mv=config.MUTPB_MV, |
@@ -796,15 +897,15 @@ def mutate( |
796 | 897 | if random.random() < pb_sp: |
797 | 898 | child = mutate_simplify_pattern(child) |
798 | 899 |
|
| 900 | + if random.random() < pb_dn: |
| 901 | + child = mutate_deep_narrow_path(child, sparql, timeout, gtp_scores) |
| 902 | + |
799 | 903 | if random.random() < pb_fv: |
800 | 904 | child = canonicalize(child) |
801 | 905 | children = mutate_fix_var(sparql, timeout, gtp_scores, child) |
802 | 906 | else: |
803 | 907 | children = [child] |
804 | 908 |
|
805 | | - |
806 | | - # TODO: deep & narrow paths mutation |
807 | | - |
808 | 909 | children = { |
809 | 910 | c if fit_to_live(c) else orig_child |
810 | 911 | for c in children |
|
0 commit comments