|
16 | 16 | import logging |
17 | 17 | import random |
18 | 18 | import string |
| 19 | +import textwrap |
19 | 20 |
|
20 | 21 | import deap |
21 | 22 | import deap.base |
|
31 | 32 | import six |
32 | 33 |
|
33 | 34 | from utils import URIShortener |
34 | | -import config |
35 | 35 |
|
36 | 36 | logger = logging.getLogger(__name__) |
37 | 37 |
|
@@ -717,84 +717,101 @@ def to_count_var_over_values_query(self, var, vars_, values, limit): |
717 | 717 | res += 'LIMIT %d\n' % limit |
718 | 718 | return self._sparql_prefix(res) |
719 | 719 |
|
720 | | - def to_find_edge_var_for_narrow_path_query( |
721 | | - self, edge_var, node_var, vars_, values, limit_res, |
722 | | - filter_node_count=config.MUTPB_DN_FILTER_NODE_COUNT, |
723 | | - filter_edge_count=config.MUTPB_DN_FILTER_EDGE_COUNT, |
| 720 | + def to_deep_narrow_path_query( |
| 721 | + self, edge_var, node_var, vars_, values, |
| 722 | + limit, max_node_count, min_edge_count, |
724 | 723 | ): |
725 | | - """Counts possible substitutions for edge_var to get a narrow path |
| 724 | + """Counts possible substitutions for edge_var to get a narrow path. |
726 | 725 |
|
727 | 726 | Meant to perform a query like this: |
728 | | - SELECT * |
729 | | - { |
| 727 | + PREFIX dbr: <http://dbpedia.org/resource/> |
| 728 | + SELECT * WHERE { |
730 | 729 | { |
731 | | - SELECT |
732 | | - ?edge_var |
733 | | - (COUNT(*) AS ?edge_var_count) |
734 | | - (MAX(?node_var_count) AS ?max_node_count) |
735 | | - (COUNT(*)/AVG(?node_var_count) as ?prio_var) |
736 | | - { |
737 | | - SELECT DISTINCT |
738 | | - ?source ?target ?edge_var (COUNT(?node_var) AS ?node_var_count) |
739 | | - { |
740 | | - VALUES (?source ?target) { |
741 | | - (dbr:Adolescence dbr:Youth) |
742 | | - (dbr:Adult dbr:Child) |
743 | | - (dbr:Angel dbr:Heaven) |
744 | | - (dbr:Arithmetic dbr:Mathematics) |
745 | | - } |
746 | | - ?node_var ?edge_var ?source . |
747 | | - ?source dbo:wikiPageWikiLink ?target . |
748 | | - } |
| 730 | + SELECT ?edge_var |
| 731 | + (SUM(?node_var_count) AS ?node_var_sum) |
| 732 | + (COUNT(?source && ?target) AS ?edge_var_count) |
| 733 | + (MAX(?node_var_count) AS ?max_node_count) |
| 734 | + WHERE { |
| 735 | + SELECT DISTINCT ?source ?target ?edge_var |
| 736 | + (COUNT(?node_var) AS ?node_var_count) |
| 737 | + WHERE { |
| 738 | + VALUES (?source ?target) { |
| 739 | + (dbr:Barrel dbr:Wine) |
| 740 | + (dbr:Barrister dbr:Law) |
| 741 | + (dbr:Beak dbr:Bird) |
| 742 | + (dbr:Blanket dbr:Bed) |
| 743 | + } |
| 744 | + ?node_var ?edge_var ?source . |
| 745 | + ?source <http://dbpedia.org/ontology/wikiPageWikiLink> ?target . |
749 | 746 | } |
750 | | - GROUP BY ?edge_var |
751 | | - ORDER BY DESC(?edge_var_count) |
| 747 | + } |
| 748 | + GROUP BY ?edge_var |
752 | 749 | } |
753 | | - FILTER(?max_node_count < 10 && ?edge_var_count > 1) |
754 | | - } |
755 | | - ORDER BY DESC(?prio_var) |
756 | | - LIMIT 32 |
| 750 | + FILTER(?max_node_count <= 10 |
| 751 | + && ?edge_var_count >= 2) |
| 752 | + } |
| 753 | + ORDER BY DESC(?edge_var_count) ASC(?node_var_sum) |
| 754 | + LIMIT 32 |
| 755 | +
|
| 756 | + The idea here is to expand a random node (?source in the example above) |
| 757 | + with a new variable triple and then try to fix its edge in a way that the |
| 758 | + degree (?node_var_count) isn't too high (<= max_node_count). We're also |
| 759 | + interested in the avg degree being low. In light of query chunking the |
| 760 | + sum is returned here (instead of AVG). |
| 761 | +
|
| 762 | + Apart from minimizing the degrees, we would also like to maximize the |
| 763 | + number of stps an ?edge_var fixation is valid for (?edge_var_count). |
| 764 | +
|
| 765 | + See gp_learner.mutate_deep_narrow_path() for more. |
757 | 766 |
|
758 | 767 | :param edge_var: Edge variable to find substitution for. |
759 | 768 | :param node_var: Node variable to count. |
760 | 769 | :param vars_: List of vars to fix values for (e.g. ?source, ?target). |
761 | 770 | :param values: List of value lists for vars_. |
762 | | - :param filter_node_count: Filter on node count of edge variable. |
763 | | - :param filter_edge_count: Filter for edge count of triples. |
764 | | - :param limit_res : limit result size |
| 771 | + :param max_node_count: Inclusive upper bound on ?max_node_count. |
| 772 | + :param min_edge_count: Inclusive lower bound on the edge var count. |
| 773 | + :param limit: Limit on the result size. |
765 | 774 | :return: Query String. |
766 | 775 | """ |
767 | 776 |
|
768 | | - res = 'SELECT * WHERE {\n' |
769 | | - res += ' {\n'\ |
770 | | - ' SELECT %s (SUM (?node_var_count) AS %s) (COUNT(%s) AS %s) ' \ |
771 | | - '(MAX(?node_var_count) AS ?max_node_count) WHERE {\n' % ( |
772 | | - edge_var.n3(), |
773 | | - NODE_VAR_SUM.n3(), |
774 | | - ' && '.join([v.n3() for v in vars_]), |
775 | | - EDGE_VAR_COUNT.n3(), ) |
776 | | - res += ' SELECT DISTINCT %s %s (COUNT(%s) AS ?node_var_count) ' \ |
777 | | - 'WHERE {\n ' % (' '.join([v.n3() for v in vars_]), |
778 | | - edge_var.n3(), node_var.n3(), ) |
779 | | - res += self._sparql_values_part(values) |
780 | | - |
781 | | - # triples part |
782 | | - tres = [] |
783 | | - for s, p, o in self: |
784 | | - tres.append('%s %s %s .' % (s.n3(), p.n3(), o.n3())) |
785 | | - indent = ' ' * 3 |
786 | | - triples = indent + ('\n' + indent).join(tres) + '\n' |
787 | | - res += triples |
788 | | - res += ' }\n'\ |
789 | | - ' }\n' |
790 | | - res += ' GROUP BY %s\n' % edge_var.n3() |
791 | | - res += ' }\n' |
792 | | - res += ' FILTER(?max_node_count < %d && %s > %d)\n' \ |
793 | | - % (filter_node_count, EDGE_VAR_COUNT.n3(), |
794 | | - filter_edge_count) |
795 | | - res += '}\n' |
796 | | - res += 'ORDER BY ASC(%s)\n' % NODE_VAR_SUM.n3() |
797 | | - res += 'LIMIT %d' % limit_res |
| 777 | + res = '''\ |
| 778 | + SELECT * WHERE { |
| 779 | + { |
| 780 | + SELECT %(edge_var)s |
| 781 | + (SUM(?node_var_count) AS %(node_var_sum)s) |
| 782 | + (COUNT(%(vars_and)s) AS %(edge_var_count)s) |
| 783 | + (MAX(?node_var_count) AS ?max_node_count) |
| 784 | + WHERE { |
| 785 | + SELECT DISTINCT %(vars)s %(edge_var)s |
| 786 | + (COUNT(%(node_var)s) AS ?node_var_count) |
| 787 | + WHERE {\n%(values_part)s %(triples)s |
| 788 | + } |
| 789 | + } |
| 790 | + GROUP BY %(edge_var)s |
| 791 | + } |
| 792 | + FILTER(?max_node_count <= %(max_node_count)d |
| 793 | + && %(edge_var_count)s >= %(min_edge_count)d) |
| 794 | + } |
| 795 | + ORDER BY DESC(%(edge_var_count)s) ASC(%(node_var_sum)s) |
| 796 | + LIMIT %(limit)d |
| 797 | + ''' % { |
| 798 | + # TODO: adapt self._sparql_values_part for template use (indent) |
| 799 | + 'edge_var': edge_var.n3(), |
| 800 | + 'node_var_sum': NODE_VAR_SUM.n3(), |
| 801 | + 'vars_and': ' && '.join([v.n3() for v in vars_]), |
| 802 | + 'edge_var_count': EDGE_VAR_COUNT.n3(), |
| 803 | + 'vars': ' '.join([v.n3() for v in vars_]), |
| 804 | + 'node_var': node_var.n3(), |
| 805 | + 'values_part': self._sparql_values_part( |
| 806 | + values, indent=' '), |
| 807 | + 'triples': '\n '.join( |
| 808 | + '%s %s %s .' % (s.n3(), p.n3(), o.n3()) for s, p, o in self |
| 809 | + ), |
| 810 | + 'limit': limit, |
| 811 | + 'max_node_count': max_node_count, |
| 812 | + 'min_edge_count': min_edge_count, |
| 813 | + } |
| 814 | + res = textwrap.dedent(res) |
798 | 815 | return self._sparql_prefix(res) |
799 | 816 |
|
800 | 817 | def to_dict(self): |
|
0 commit comments