Ticket #1090: 0001-IndexStore-refactoring-and-prefix-term-support.patch

File 0001-IndexStore-refactoring-and-prefix-term-support.patch, 13.4 KB (added by sascha_silbe, 15 years ago)

code for round 4

  • src/carquinyol/indexstore.py

    From 5a950930667055e1757bd24b408a48ddbfc8e4c4 Mon Sep 17 00:00:00 2001
    From: Sascha Silbe <sascha@silbe.org>
    Date: Mon, 17 Aug 2009 19:41:04 +0200
    Subject: [PATCH] IndexStore refactoring and prefix term support
    
    ---
     src/carquinyol/indexstore.py |  287 +++++++++++++++++++++++++----------------
     1 files changed, 175 insertions(+), 112 deletions(-)
    
    diff --git a/src/carquinyol/indexstore.py b/src/carquinyol/indexstore.py
    index 42c3132..7b620b5 100644
    a b  
    1616
    1717import logging
    1818import os
     19import sys
    1920
    2021import gobject
    2122import xapian
    22 from xapian import WritableDatabase, Document, Enquire, Query, QueryParser
     23from xapian import WritableDatabase, Document, Enquire, Query
    2324
    2425from carquinyol import layoutmanager
    2526from carquinyol.layoutmanager import MAX_QUERY_LIMIT
    _VALUE_UID = 0 
    2829_VALUE_TIMESTAMP = 1
    2930_VALUE_TITLE = 2
    3031
     32_PREFIX_NONE = 'N'
     33_PREFIX_FULL_VALUE = 'F'
    3134_PREFIX_UID = 'Q'
    3235_PREFIX_ACTIVITY = 'A'
    3336_PREFIX_ACTIVITY_ID = 'I'
    _FLUSH_THRESHOLD = 20 
    4043# Force a flush after _n_ seconds since the last change to the db
    4144_FLUSH_TIMEOUT = 60
    4245
    43 _PROPERTIES_NOT_TO_INDEX = ['timestamp', 'activity_id', 'keep', 'preview']
     46_PROPERTIES_NOT_TO_INDEX = ['timestamp', 'preview']
    4447
    4548_MAX_RESULTS = int(2 ** 31 - 1)
    4649
     50_QUERY_TERM_MAP = {
     51    'uid': _PREFIX_UID,
     52    'activity': _PREFIX_ACTIVITY,
     53    'activity_id': _PREFIX_ACTIVITY_ID,
     54    'mime_type': _PREFIX_MIME_TYPE,
     55    'keep': _PREFIX_KEEP,
     56}
     57
     58_QUERY_VALUE_MAP = {
     59    'timestamp': _VALUE_TIMESTAMP,
     60}
     61
     62
     63class TermGenerator (xapian.TermGenerator):
     64
     65    def index_document(self, document, properties):
     66        document.add_value(_VALUE_TIMESTAMP, str(properties['timestamp']))
     67        document.add_value(_VALUE_TITLE, properties.get('title', '').strip())
     68
     69        xapian.TermGenerator.set_document(self, document)
     70
     71        properties = dict(properties)
     72        self._index_known(document, properties)
     73        self._index_unknown(document, properties)
     74
     75    def _index_known(self, document, properties):
     76        for name, prefix in _QUERY_TERM_MAP.items():
     77            if (name not in properties):
     78                continue
     79
     80            self._index_property(document, name, properties.pop(name), prefix)
     81
     82    def _index_unknown(self, document, properties):
     83        for name, value in properties.items():
     84            self._index_property(document, name, value)
     85
     86    def _index_property(self, doc, name, value, prefix=''):
     87        if name in _PROPERTIES_NOT_TO_INDEX or not value:
     88            return
     89
     90        if isinstance(value, unicode):
     91            value = value.encode('utf-8')
     92        elif not isinstance(value, basestring):
     93            value = str(value)
     94
     95        # We need to add the full value (i.e. not split into words) so
     96        # we can enumerate unique values. It also simplifies setting up
     97        # dictionary-based queries.
     98        if prefix:
     99            doc.add_term(_PREFIX_FULL_VALUE + prefix + value)
     100
     101        self.index_text(value, 1, prefix or _PREFIX_NONE)
     102        self.increase_termpos()
     103
     104
     105class QueryParser (xapian.QueryParser):
     106    """QueryParser that understands dictionaries and Xapian query strings.
     107
     108    The dictionary contains metadata names as keys and either basic types
     109    (exact match), 2-tuples (range, only valid for value-stored metadata)
     110    or a list (multiple exact matches joined with OR) as values.
     111    An empty dictionary matches everything. Queries from different keys
     112    (i.e. different metadata names) are joined with AND.
     113    """
     114
     115    def __init__(self):
     116        xapian.QueryParser.__init__(self)
     117
     118        for name, prefix in _QUERY_TERM_MAP.items():
     119            self.add_prefix(name, prefix)
     120            self.add_prefix('', prefix)
     121
     122        self.add_prefix('', _PREFIX_NONE)
     123
     124    def _parse_query_term(self, name, prefix, value):
     125        if isinstance(value, list):
     126            subqueries = [self._parse_query_term(name, prefix, word)
     127                for word in value]
     128            return Query(Query.OP_OR, subqueries)
     129
     130        elif prefix:
     131            return Query(_PREFIX_FULL_VALUE + prefix + str(value))
     132        else:
     133            return Query(_PREFIX_NONE + str(value))
     134
     135    def _parse_query_value_range(self, name, value, value_no):
     136        if len(value) != 2:
     137            raise TypeError(
     138                'Only tuples of size 2 have a defined meaning. '
     139                'Did you mean to pass a list instead?')
     140
     141        start, end = value
     142        return Query(Query.OP_VALUE_RANGE, value_no, str(start), str(end))
     143
     144    def _parse_query_value(self, name, value_no, value):
     145        if isinstance(value, list):
     146            subqueries = [self._parse_query_value(name, value_no, word)
     147                for word in value]
     148            return Query(Query.OP_OR, subqueries)
     149
     150        elif isinstance(value, tuple):
     151            return self._parse_query_value_range(name, value, value_no)
     152
     153        elif isinstance(value, dict):
     154            # compatibility option for timestamp: {'start': 0, 'end': 1}
     155            start = value.get('start', 0)
     156            end = value.get('end', sys.maxint)
     157            return self._parse_query_value_range(name, (start, end), value_no)
     158
     159        else:
     160            return Query(Query.OP_VALUE_RANGE,
     161                _QUERY_VALUE_MAP[name], str(value), str(value))
     162
     163    def _parse_query_xapian(self, query_str):
     164        try:
     165            return xapian.QueryParser.parse_query(
     166                self, query_str,
     167                QueryParser.FLAG_PHRASE |
     168                        QueryParser.FLAG_BOOLEAN |
     169                        QueryParser.FLAG_LOVEHATE |
     170                        QueryParser.FLAG_WILDCARD,
     171                '')
     172
     173        except xapian.QueryParserError, exception:
     174            logging.warning('Invalid query string: '+exception.get_msg())
     175            return Query()
     176
     177    def parse_query(self, query_dict, query_string):
     178        logging.debug('parse_query %r %r', query_dict, query_string)
     179        queries = []
     180        query_dict = dict(query_dict)
     181
     182        if query_string:
     183            queries.append(self._parse_query_xapian(str(query_string)))
     184
     185        for name, value in query_dict.items():
     186            if name in _QUERY_TERM_MAP:
     187                queries.append(self._parse_query_term(name,
     188                    _QUERY_TERM_MAP[name], value))
     189            elif name in _QUERY_VALUE_MAP:
     190                queries.append(self._parse_query_value(name,
     191                    _QUERY_VALUE_MAP[name], value))
     192            else:
     193                logging.warning('Unknown term: %r=%r', name, value)
     194
     195        if not queries:
     196            queries.append(Query(''))
     197
     198        if query_dict:
     199            logging.warning('Unknown term(s): %r', query_dict)
     200
     201        logging.debug('queries: %r', [str(q) for q in queries])
     202        return Query(Query.OP_AND, queries)
     203
    47204
    48205class IndexStore(object):
    49206    """Index metadata and provide rich query facilities on it.
    class IndexStore(object): 
    70227            os.remove(os.path.join(index_path, f))
    71228
    72229    def contains(self, uid):
    73         postings = self._database.postlist(_PREFIX_UID + uid)
     230        postings = self._database.postlist(_PREFIX_FULL_VALUE + \
     231            _PREFIX_UID + uid)
    74232        try:
    75233            postlist_item = postings.next()
    76234        except StopIteration:
    class IndexStore(object): 
    79237
    80238    def store(self, uid, properties):
    81239        document = Document()
    82         document.add_term(_PREFIX_UID + uid)
    83         document.add_term(_PREFIX_ACTIVITY + properties.get('activity', ''))
    84         document.add_term(_PREFIX_MIME_TYPE + properties.get('mime_type', ''))
    85         document.add_term(_PREFIX_ACTIVITY_ID +
    86                           properties.get('activity_id', ''))
    87         document.add_term(_PREFIX_KEEP + str(properties.get('keep', 0)))
    88 
    89240        document.add_value(_VALUE_UID, uid)
    90         document.add_value(_VALUE_TIMESTAMP, str(properties['timestamp']))
    91         document.add_value(_VALUE_TITLE, properties.get('title', '').strip())
    92 
    93         term_generator = xapian.TermGenerator()
    94 
    95         # TODO: we should do stemming, but in which language?
    96         #if language is not None:
    97         #    term_generator.set_stemmer(_xapian.Stem(language))
    98 
    99         # TODO: we should use a stopper
    100         #if stop is not None:
    101         #    stopper = _xapian.SimpleStopper()
    102         #    for term in stop:
    103         #        stopper.add (term)
    104         #    term_generator.set_stopper (stopper)
    105 
    106         term_generator.set_document(document)
    107         term_generator.index_text_without_positions(
    108                 self._extract_text(properties), 1, '')
     241        term_generator = TermGenerator()
     242        term_generator.index_document(document, properties)
    109243
    110244        if not self.contains(uid):
    111245            self._database.add_document(document)
    112246        else:
    113             self._database.replace_document(_PREFIX_UID + uid, document)
    114         self._flush()
     247            self._database.replace_document(_PREFIX_FULL_VALUE + \
     248                _PREFIX_UID + uid, document)
    115249
    116     def _extract_text(self, properties):
    117         text = ''
    118         for key, value in properties.items():
    119             if key not in _PROPERTIES_NOT_TO_INDEX:
    120                 if text:
    121                     text += ' '
    122                 if isinstance(value, unicode):
    123                     value = value.encode('utf-8')
    124                 elif not isinstance(value, basestring):
    125                     value = str(value)
    126                 text += value
    127         return text
     250        self._flush()
    128251
    129252    def find(self, query):
    130253        offset = query.pop('offset', 0)
    131254        limit = query.pop('limit', MAX_QUERY_LIMIT)
    132255        order_by = query.pop('order_by', [])
     256        query_string = query.pop('query', None)
    133257
     258        query_parser = QueryParser()
     259        query_parser.set_database(self._database)
    134260        enquire = Enquire(self._database)
    135         enquire.set_query(self._parse_query(query))
     261        enquire.set_query(query_parser.parse_query(query, query_string))
    136262
    137263        # This will assure that the results count is exact.
    138264        check_at_least = offset + limit + 1
    class IndexStore(object): 
    151277        elif order_by == '-title':
    152278            enquire.set_sort_by_value(_VALUE_TITLE, False)
    153279        else:
    154             logging.warning('Unsupported property for sorting: %s' % order_by)
     280            logging.warning('Unsupported property for sorting: %s', order_by)
    155281
    156282        query_result = enquire.get_mset(offset, limit, check_at_least)
    157283        total_count = query_result.get_matches_estimated()
    class IndexStore(object): 
    162288
    163289        return (uids, total_count)
    164290
    165     def _parse_query(self, query_dict):
    166         logging.debug('_parse_query %r' % query_dict)
    167         queries = []
    168 
    169         query_str = query_dict.pop('query', None)
    170         if query_str is not None:
    171             query_parser = QueryParser()
    172             query_parser.set_database(self._database)
    173             #query_parser.set_default_op(Query.OP_AND)
    174 
    175             # TODO: we should do stemming, but in which language?
    176             #query_parser.set_stemmer(_xapian.Stem(lang))
    177             #query_parser.set_stemming_strategy(qp.STEM_SOME)
    178 
    179             query = query_parser.parse_query(
    180                     query_str,
    181                     QueryParser.FLAG_PHRASE |
    182                             QueryParser.FLAG_BOOLEAN |
    183                             QueryParser.FLAG_LOVEHATE |
    184                             QueryParser.FLAG_WILDCARD,
    185                     '')
    186 
    187             queries.append(query)
    188 
    189         timestamp = query_dict.pop('timestamp', None)
    190         if timestamp is not None:
    191             start = str(timestamp.pop('start', 0))
    192             end = str(timestamp.pop('end', _MAX_RESULTS))
    193             query = Query(Query.OP_VALUE_RANGE, _VALUE_TIMESTAMP, start, end)
    194             queries.append(query)
    195 
    196         uid = query_dict.pop('uid', None)
    197         if uid is not None:
    198             queries.append(Query(_PREFIX_UID + uid))
    199 
    200         activity = query_dict.pop('activity', None)
    201         if activity is not None:
    202             queries.append(Query(_PREFIX_ACTIVITY + activity))
    203 
    204         activity_id = query_dict.pop('activity_id', None)
    205         if activity_id is not None:
    206             query = Query(_PREFIX_ACTIVITY_ID + activity_id)
    207             queries.append(query)
    208 
    209         keep = query_dict.pop('keep', None)
    210         if keep is not None:
    211             query = Query(_PREFIX_KEEP + str(keep))
    212             queries.append(query)
    213 
    214         mime_type = query_dict.pop('mime_type', None)
    215         if mime_type is not None:
    216             mime_queries = []
    217             for mime_type in mime_type:
    218                 mime_queries.append(Query(_PREFIX_MIME_TYPE + mime_type))
    219             queries.append(Query(Query.OP_OR, mime_queries))
    220 
    221         if not queries:
    222             queries.append(Query(''))
    223 
    224         if query_dict:
    225             logging.warning('Unknown term(s): %r' % query_dict)
    226 
    227         return Query(Query.OP_AND, queries)
    228 
    229291    def delete(self, uid):
    230         self._database.delete_document(_PREFIX_UID + uid)
     292        self._database.delete_document(_PREFIX_FULL_VALUE + _PREFIX_UID + uid)
    231293
    232294    def get_activities(self):
    233295        activities = []
    234         for term in self._database.allterms(_PREFIX_ACTIVITY):
    235             activities.append(term.term[len(_PREFIX_ACTIVITY):])
     296        prefix = _PREFIX_FULL_VALUE + _PREFIX_ACTIVITY
     297        for term in self._database.allterms(prefix):
     298            activities.append(term.term[len(prefix):])
    236299        return activities
    237300
    238301    def _flush_timeout_cb(self):