Ticket #1090: sugar-datastore-prefixes.patch

File sugar-datastore-prefixes.patch, 12.4 KB (added by sascha_silbe, 14 years ago)

round 3

  • src/carquinyol/indexstore.py

    diff --git a/src/carquinyol/indexstore.py b/src/carquinyol/indexstore.py
    index 42c3132..45f09a9 100644
    a b  
    1616
    1717import logging
    1818import os
     19import sys
     20import time
    1921
    2022import gobject
    2123import xapian
    22 from xapian import WritableDatabase, Document, Enquire, Query, QueryParser
     24from xapian import WritableDatabase, Document, Enquire, Query
    2325
    2426from carquinyol import layoutmanager
    2527from carquinyol.layoutmanager import MAX_QUERY_LIMIT
    _FLUSH_THRESHOLD = 20 
    4042# Force a flush after _n_ seconds since the last change to the db
    4143_FLUSH_TIMEOUT = 60
    4244
    43 _PROPERTIES_NOT_TO_INDEX = ['timestamp', 'activity_id', 'keep', 'preview']
     45_PROPERTIES_NOT_TO_INDEX = ['timestamp', 'preview']
    4446
    4547_MAX_RESULTS = int(2 ** 31 - 1)
    4648
     # Metadata names that are indexed and queried as prefixed terms.
     # TermGenerator uses the prefix when indexing these properties and
     # QueryParser registers the same name -> prefix pairs via add_prefix()
     # and uses them to build exact-match term queries.
     _QUERY_TERM_MAP = {
         'uid': _PREFIX_UID,
         'activity': _PREFIX_ACTIVITY,
         'activity_id': _PREFIX_ACTIVITY_ID,
         'mime_type': _PREFIX_MIME_TYPE,
         'keep': _PREFIX_KEEP,
     }

     # Metadata names that are queried through document value slots
     # (QueryParser builds OP_VALUE_RANGE queries on the mapped slot number).
     _QUERY_VALUE_MAP = {
         'timestamp': _VALUE_TIMESTAMP,
     }
     60
     61
     62class TermGenerator (xapian.TermGenerator):
     63
     64    def index_document(self, document, properties):
     65        document.add_value(_VALUE_TIMESTAMP, str(properties['timestamp']))
     66        document.add_value(_VALUE_TITLE, properties.get('title', '').strip())
     67
     68        xapian.TermGenerator.set_document(self, document)
     69
     70        properties = dict(properties)
     71        self._index_known(document, properties)
     72        self._index_unknown(document, properties)
     73
     74    def _index_known(self, document, properties):
     75        for name, prefix in _QUERY_TERM_MAP.items():
     76            if (name not in properties):
     77                continue
     78
     79            self._index_property(document, name, properties.pop(name), prefix)
     80
     81    def _index_unknown(self, document, properties):
     82        for name, value in properties.items():
     83            self._index_property(document, name, value)
     84
     85    def _index_property(self, doc, name, value, prefix=''):
     86        if name in _PROPERTIES_NOT_TO_INDEX or not value:
     87            return
     88
     89        if isinstance(value, unicode):
     90            value = value.encode('utf-8')
     91        elif not isinstance(value, basestring):
     92            value = str(value)
     93
     94        # We need to add the full value (i.e. not split into words) so
     95        # dictionary-based queries work (they don't split the value
     96        # into words).
     97        # TODO: change query parser to generate phrase query instead
     98        doc.add_term(prefix + value)
     99
     100        # We need to index both with and without prefix because Xapian
     101        # only matches against non-prefix terms if no prefix is given
     102        # inside the query.
     103        if prefix:
     104            self.index_text(value, 1, prefix)
     105
     106        self.index_text(value)
     107        self.increase_termpos()
     108
     109
     110class QueryParser (xapian.QueryParser):
     111    """QueryParser that understands dictionaries and Xapian query strings.
     112
     113    The dictionary contains metadata names as keys and either basic types
     114    (exact match), 2-tuples (range, only valid for value-stored metadata)
     115    or a list (multiple exact matches joined with OR) as values.
     116    An empty dictionary matches everything. Queries from different keys
     117    (i.e. different metadata names) are joined with AND.
     118    """
     119
     120    def __init__(self):
     121        xapian.QueryParser.__init__(self)
     122
     123        for name, prefix in _QUERY_TERM_MAP.items():
     124            self.add_prefix(name, prefix)
     125
     126    def _parse_query_term(self, name, prefix, value):
     127        if isinstance(m_value, list):
     128            subqueries = [self._parse_query_term(name, prefix, word)
     129                for word in value]
     130            return Query(Query.OP_OR, subqueries)
     131
     132        else:
     133            return Query(prefix+str(value))
     134
     135    def _parse_query_value_range(self, name, value, value_no):
     136        if len(value) != 2:
     137            raise TypeError(
     138                'Only tuples of size 2 have a defined meaning. '
     139                'Did you mean to pass a list instead?')
     140
     141        start, end = value
     142        return Query(Query.OP_VALUE_RANGE, value_no, str(start), str(end))
     143
     144    def _parse_query_value(self, name, value_no, value):
     145        if isinstance(value, list):
     146            subqueries = [self._parse_query_value(name, value_no, word)
     147                for word in value]
     148            return Query(Query.OP_OR, subqueries)
     149
     150        elif isinstance(value, tuple):
     151            return self._parse_query_value_range(name, value, value_no)
     152
     153        elif isinstance(value, dict):
     154            # compatibility option for timestamp: {'start': 0, 'end': 1}
     155            start = value.get('start', 0)
     156            end = value.get('end', sys.maxint)
     157            return self._parse_query_value_range(name, (start, end), value_no)
     158
     159        else:
     160            return Query(Query.OP_VALUE_RANGE,
     161                _QUERY_VALUE_MAP[name], str(value), str(value))
     162
     163    def _parse_query_xapian(self, query_str):
     164        try:
     165            return xapian.QueryParser.parse_query(
     166                self, query_str,
     167                QueryParser.FLAG_PHRASE |
     168                        QueryParser.FLAG_BOOLEAN |
     169                        QueryParser.FLAG_LOVEHATE |
     170                        QueryParser.FLAG_WILDCARD,
     171                '')
     172
     173        except xapian.QueryParserError, exception:
     174            logging.warning('Invalid query string: '+exception.get_msg())
     175            return Query()
     176
     177    def parse_query(self, query_dict, query_string):
     178        logging.debug('parse_query %r %r', query_dict, query_string)
     179        queries = []
     180        query_dict = dict(query_dict)
     181
     182        if query_string:
     183            queries.append(self._parse_query_xapian(str(query_string)))
     184
     185        queries += [
     186            self._parse_query_term(name, prefix, query_dict.pop(name))
     187            for name, prefix in _QUERY_TERM_MAP.items()
     188            if name in query_dict]
     189
     190        queries += [
     191            self._parse_query_value(name, value_no, query_dict.pop(name))
     192            for name, value_no in _QUERY_VALUE_MAP.items()
     193            if name in query_dict]
     194
     195        if not queries:
     196            queries.append(Query(''))
     197
     198        if query_dict:
     199            logging.warning('Unknown term(s): %r', query_dict)
     200
     201        logging.debug('queries: %r', [str(q) for q in queries])
     202        return Query(Query.OP_AND, queries)
     203
    47204
    48205class IndexStore(object):
    49206    """Index metadata and provide rich query facilities on it.
    class IndexStore(object): 
    53210        self._database = None
    54211        self._flush_timeout = None
    55212        self._pending_writes = 0
     213        self._query_parser = None
     214        self._term_generator = None
    56215
    57216    def open_index(self):
    58217        index_path = layoutmanager.get_instance().get_index_path()
    59218        self._database = WritableDatabase(index_path, xapian.DB_CREATE_OR_OPEN)
    60219
     220        self._query_parser = QueryParser()
     221        self._query_parser.set_database(self._database)
     222        self._term_generator = TermGenerator()
     223
    61224    def close_index(self):
    62225        self._database.flush()
    63226        self._database = None
    class IndexStore(object): 
    79242
    80243    def store(self, uid, properties):
    81244        document = Document()
    82         document.add_term(_PREFIX_UID + uid)
    83         document.add_term(_PREFIX_ACTIVITY + properties.get('activity', ''))
    84         document.add_term(_PREFIX_MIME_TYPE + properties.get('mime_type', ''))
    85         document.add_term(_PREFIX_ACTIVITY_ID +
    86                           properties.get('activity_id', ''))
    87         document.add_term(_PREFIX_KEEP + str(properties.get('keep', 0)))
    88 
    89245        document.add_value(_VALUE_UID, uid)
    90         document.add_value(_VALUE_TIMESTAMP, str(properties['timestamp']))
    91         document.add_value(_VALUE_TITLE, properties.get('title', '').strip())
    92 
    93         term_generator = xapian.TermGenerator()
    94 
    95         # TODO: we should do stemming, but in which language?
    96         #if language is not None:
    97         #    term_generator.set_stemmer(_xapian.Stem(language))
    98 
    99         # TODO: we should use a stopper
    100         #if stop is not None:
    101         #    stopper = _xapian.SimpleStopper()
    102         #    for term in stop:
    103         #        stopper.add (term)
    104         #    term_generator.set_stopper (stopper)
    105 
    106         term_generator.set_document(document)
    107         term_generator.index_text_without_positions(
    108                 self._extract_text(properties), 1, '')
     246        self._term_generator.index_document(document, properties)
    109247
    110248        if not self.contains(uid):
    111249            self._database.add_document(document)
    112250        else:
    113251            self._database.replace_document(_PREFIX_UID + uid, document)
    114         self._flush()
    115252
    116     def _extract_text(self, properties):
    117         text = ''
    118         for key, value in properties.items():
    119             if key not in _PROPERTIES_NOT_TO_INDEX:
    120                 if text:
    121                     text += ' '
    122                 if isinstance(value, unicode):
    123                     value = value.encode('utf-8')
    124                 elif not isinstance(value, basestring):
    125                     value = str(value)
    126                 text += value
    127         return text
     253        self._flush()
    128254
    129255    def find(self, query):
    130256        offset = query.pop('offset', 0)
    131257        limit = query.pop('limit', MAX_QUERY_LIMIT)
    132258        order_by = query.pop('order_by', [])
     259        query_string = query.pop('query', None)
    133260
    134261        enquire = Enquire(self._database)
    135         enquire.set_query(self._parse_query(query))
     262        enquire.set_query(self._query_parser.parse_query(query, query_string))
    136263
    137264        # This will assure that the results count is exact.
    138265        check_at_least = offset + limit + 1
    class IndexStore(object): 
    151278        elif order_by == '-title':
    152279            enquire.set_sort_by_value(_VALUE_TITLE, False)
    153280        else:
    154             logging.warning('Unsupported property for sorting: %s' % order_by)
     281            logging.warning('Unsupported property for sorting: %s', order_by)
    155282
    156283        query_result = enquire.get_mset(offset, limit, check_at_least)
    157284        total_count = query_result.get_matches_estimated()
    class IndexStore(object): 
    162289
    163290        return (uids, total_count)
    164291
    165     def _parse_query(self, query_dict):
    166         logging.debug('_parse_query %r' % query_dict)
    167         queries = []
    168 
    169         query_str = query_dict.pop('query', None)
    170         if query_str is not None:
    171             query_parser = QueryParser()
    172             query_parser.set_database(self._database)
    173             #query_parser.set_default_op(Query.OP_AND)
    174 
    175             # TODO: we should do stemming, but in which language?
    176             #query_parser.set_stemmer(_xapian.Stem(lang))
    177             #query_parser.set_stemming_strategy(qp.STEM_SOME)
    178 
    179             query = query_parser.parse_query(
    180                     query_str,
    181                     QueryParser.FLAG_PHRASE |
    182                             QueryParser.FLAG_BOOLEAN |
    183                             QueryParser.FLAG_LOVEHATE |
    184                             QueryParser.FLAG_WILDCARD,
    185                     '')
    186 
    187             queries.append(query)
    188 
    189         timestamp = query_dict.pop('timestamp', None)
    190         if timestamp is not None:
    191             start = str(timestamp.pop('start', 0))
    192             end = str(timestamp.pop('end', _MAX_RESULTS))
    193             query = Query(Query.OP_VALUE_RANGE, _VALUE_TIMESTAMP, start, end)
    194             queries.append(query)
    195 
    196         uid = query_dict.pop('uid', None)
    197         if uid is not None:
    198             queries.append(Query(_PREFIX_UID + uid))
    199 
    200         activity = query_dict.pop('activity', None)
    201         if activity is not None:
    202             queries.append(Query(_PREFIX_ACTIVITY + activity))
    203 
    204         activity_id = query_dict.pop('activity_id', None)
    205         if activity_id is not None:
    206             query = Query(_PREFIX_ACTIVITY_ID + activity_id)
    207             queries.append(query)
    208 
    209         keep = query_dict.pop('keep', None)
    210         if keep is not None:
    211             query = Query(_PREFIX_KEEP + str(keep))
    212             queries.append(query)
    213 
    214         mime_type = query_dict.pop('mime_type', None)
    215         if mime_type is not None:
    216             mime_queries = []
    217             for mime_type in mime_type:
    218                 mime_queries.append(Query(_PREFIX_MIME_TYPE + mime_type))
    219             queries.append(Query(Query.OP_OR, mime_queries))
    220 
    221         if not queries:
    222             queries.append(Query(''))
    223 
    224         if query_dict:
    225             logging.warning('Unknown term(s): %r' % query_dict)
    226 
    227         return Query(Query.OP_AND, queries)
    228 
    229292    def delete(self, uid):
    230293        self._database.delete_document(_PREFIX_UID + uid)
    231294