root/projects/sAsync/trunk/sasync/search.py

Revision 3, 9.0 kB (checked in by edsuom, 1 year ago)

Import of trunk from old repo

Line 
1 # sAsync:
2 # An enhancement to the SQLAlchemy package that provides persistent
3 # dictionaries, text indexing and searching, and an access broker for
4 # conveniently managing database access, table setup, and
5 # transactions. Everything can be run in an asynchronous fashion using the
6 # Twisted framework and its deferred processing capabilities.
7 #
8 # Copyright (C) 2006 by Edwin A. Suominen, http://www.eepatents.com
9 #
10 # This program is free software; you can redistribute it and/or modify it under
11 # the terms of the GNU General Public License as published by the Free Software
12 # Foundation; either version 2 of the License, or (at your option) any later
13 # version.
14 #
15 # This program is distributed in the hope that it will be useful, but WITHOUT
16 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 # FOR A PARTICULAR PURPOSE.  See the file COPYING for more details.
18 #
19 # You should have received a copy of the GNU General Public License along with
20 # this program; if not, write to the Free Software Foundation, Inc., 51
21 # Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
22
23 """
24 Text indexing and searching (TODO)
25
26 """
27
28 # Imports
29 from twisted.internet import defer
30 import sqlalchemy as SA
31 from sasync.database import AccessBroker
32
# Config
# Length given to the String type of the 'word' column in the 'words' table
# set up by Search.userStartup. Presumably indexed words are truncated to this
# many characters -- TODO confirm once indexing is implemented.
TRUNCATED_WORD_LENGTH   = 20
35
36
class Records:
    """
    I am the common ancestor of all record keeping classes. I define no
    behavior of my own; subclasses such as L{DatabaseRecords} and
    L{FileRecords} supply the C{startup}, C{addRecord} and C{getRecord}
    methods.
    """
42
43
class DatabaseRecords(Records):
    """
    I keep text records in the database of my searcher parent
    """
    def startup(self, parent):
        """
        Remembers the I{parent} searcher and has it set up a C{records} table
        with integer C{doc_id} and C{section_id} columns (sharing the index
        named C{section}) and a non-nullable C{text} column.

        @return: Whatever the parent's C{table} method returns. L{Search}
            places that result in a C{DeferredList}, so it is presumably a
            C{Deferred} -- confirm against C{AccessBroker.table}.

        """
        self.parent = parent
        return parent.table(
            'records',
            SA.Column('doc_id', SA.Integer, index="section"),
            SA.Column('section_id', SA.Integer, index="section"),
            SA.Column('text', SA.String, nullable=False)
            )

    def addRecord(self, record, document=None, section=None):
        """
        Adds a I{record} supplied as a Python object with a unique integer
        I{document} identifier. The Python object must have a string
        representation that provides its text content. You can supply a unique
        integer I{section} as a keyword.

        @return: Intended to be a C{Deferred} to a list of unique words
            extracted from the record's plain text content for indexing. The
            storage and word extraction are not implemented yet, so C{None} is
            currently returned once the I{document} ID has been validated.

        @raise ValueError: If I{document} is not an integer, which includes
            the case where it is omitted.

        """
        # The original called the undefined name 'instance' here, which raised
        # NameError on every call instead of validating the argument.
        if not isinstance(document, int):
            raise ValueError("You must supply an integer document ID")
        # TODO: Store the record text and return a Deferred to its words.

    def getRecord(self, document, section=None, first=None, last=None):
        """
        Intended to return a C{Deferred} to the text content of the
        I{document}, optionally limited to a particular I{section}.

        The text content to be returned can be restricted to a block of text
        starting at a I{first} word and ending at a I{last} word, with the word
        positions supplied as integer keywords.

        Not implemented yet; currently returns C{None}.
        """
        pass
82
83
class FileRecords(Records):
    """
    I use existing files as text records
    """
    def startup(self, parent):
        """
        Remembers the I{parent} searcher. No database table is needed for
        file-based records, so nothing else is set up.

        @return: A C{Deferred} that has already fired with C{None}.

        """
        self.parent = parent
        return defer.succeed(None)

    def addRecord(self, record, document=None, section=None):
        """
        Adds a I{record} supplied as the valid path of a file. A hash of the
        unique file path is used as the document identifier and only the
        default section is used. Thus any I{document} or I{section} keyword IDs
        supplied are ignored.

        @return: Intended to be a C{Deferred} to a list of unique words
            extracted from the file's plain text content for indexing. Not
            implemented yet; currently returns C{None}.

        """
        pass

    def getRecord(self, document, section=None, first=None, last=None):
        """
        Intended to return a C{Deferred} to the text content of the file record
        identified by I{document}. Any I{section} keyword supplied is ignored
        because different sections of files are not recognized.

        The text content to be returned can be restricted to a block of text
        starting at a I{first} word and ending at a I{last} word, with the word
        positions supplied as integer keywords.

        Retrieval is not implemented yet, so C{None} is currently returned once
        the I{document} ID has been validated.

        @raise ValueError: If I{document} is not an integer.

        """
        # The original called the undefined name 'instance' here, which raised
        # NameError on every call instead of validating the argument.
        #
        # NOTE(review): the original docstring said the document is supplied
        # as a file path, yet this check demands an integer. Presumably the
        # integer is the hash of the path that addRecord describes as the
        # document identifier -- confirm which one callers should pass.
        if not isinstance(document, int):
            raise ValueError("You must supply an integer document ID")
        # TODO: Read the file and return a Deferred to the requested text.
119
120
class Search(AccessBroker):
    """
    I provide an interface for indexing terms of new records and searching for
    text contained within records already indexed.

    I am instantiated with a reference to whatever subclass of L{Records} I
    should instantiate to extract text from objects presented for indexing and
    convert word positions of search results back into the text of the original
    objects.
    """
    def __init__(self, recordsClass):
        """
        Instantiates I{recordsClass} with no arguments and keeps the result as
        my record I{keeper}.
        """
        self.keeper = recordsClass()

    def userStartup(self):
        """
        Sets up the C{words} and C{usage} tables, marks me as ready for
        searching, and starts up my record keeper.

        @return: A C{DeferredList} combining the results of both table setups
            and the keeper's C{startup} call.

        """
        # NOTE(review): the base class constructor is called here rather than
        # in __init__. That is preserved as found; confirm against
        # AccessBroker that initializing at startup time is intended.
        AccessBroker.__init__(self, twisted=True)
        d1 = self.table(
            'words',
            SA.Column('id', SA.Integer, index="word"),
            SA.Column('word',
                   SA.String(TRUNCATED_WORD_LENGTH),
                   primary_key=True)
            )
        # The original referred to bare 'Column' and 'Integer' below, but this
        # module only imports sqlalchemy under the name SA, so reaching this
        # call raised NameError.
        d2 = self.table(
            'usage',
            SA.Column('word_id', SA.Integer, primary_key=True, index="scope"),
            SA.Column('doc_id', SA.Integer, index="scope"),
            SA.Column('section_id', SA.Integer, index="scope"),
            SA.Column('position', SA.Integer, nullable=False)
            )
        self._ready = True
        d3 = self.keeper.startup(self)
        return defer.DeferredList([d1,d2,d3])

    def busy(self, *args):
        """
        Indicates that indexing is in progress, which forces calls to my
        L{search} method to queue up until I{ready} status resumes.

        Any arguments are accepted and ignored.
        """
        self._ready = False

    def ready(self, *args):
        """
        Indicates that no indexing is in progress, which permits calls to my
        L{search} method to start working on queries immediately

        Any arguments are accepted and ignored.
        """
        self._ready = True

    def index(self, record, document=None, section=None):
        """
        Indexes the text content of the supplied I{record} under the supplied
        I{document} and I{section} identifiers, which must be integers if
        specified.

        Returns a C{Deferred} that fires with no argument when the indexing is
        done.

        If no document is specified, the text is considered as being at the end
        of whatever has already been indexed for a default document with the
        identifier of zero. Likewise, every document (including the default)
        has a default section, also with C{ID=0}, for indexing and searching of
        records with no section specified.

        @todo: Indexing is not implemented yet. The returned C{Deferred} has
            already fired with C{None} and nothing is indexed.

        """
        return defer.succeed(None)

    def drop(self, document, section=None):
        """
        Drops the index entries for the supplied I{document} and optionally
        supplied I{section} identifiers, which must be integers.

        Returns a C{Deferred} that fires with no argument when the index update
        is done.

        If no section is specified, the index entries for the default document
        will be dropped. (NOTE(review): "default section" may have been meant
        here -- confirm.)

        @todo: Dropping is not implemented yet. The returned C{Deferred} has
            already fired with C{None} and nothing is dropped.

        """
        return defer.succeed(None)

    def search(self, query, scope=None):
        """
        Searches the record of the documents with IDs in the supplied I{scope}
        sequence for matches with the supplied query. Items of B{all}
        dictionaries are searched if no restriction on search scope is defined.

        Returns a C{Deferred} that fires with the results of the search when it
        is done. The results are passed to the callback as a list of tuples
        C{(first, last, words, document, section)} that specify matching blocks
        of text from each matching record. The tuple elements are, in order:

            - B{first}: An integer specifying the position of the C{first} word
              in the matching text block within the specified section of the
              specified document.

            - B{last}: An integer specifying the position of the C{last} word
              in the matching text block within the specified section of the
              specified document.

            - B{words}: A list containing integer positions of words that
              triggered the match.

            - B{document}: An integer specifying the document in which the text
              block was found.

            - B{section}: An integer specifying the section of the document in
              which the text block was found.

        @todo: Initially the query just contains terms that must all be present
            in the item's text value, I{i.e.}, logical AND. Expand this method
            to parse the query for proximity operators etc.

        @todo: Searching is not implemented yet. The returned C{Deferred} has
            already fired with an empty list, whatever the query.

        """
        results = []
        return defer.succeed(results)

    def record(self, document, section=None, first=None, last=None):
        """
        Intended to return a C{Deferred} to the text content of the record for
        the specified I{document}, optionally limited to a particular
        I{section}.

        The text content to be returned can be restricted to a block of text
        starting at a I{first} word and ending at a I{last} word, with the word
        positions supplied as integer keywords.

        Not implemented yet; currently returns C{None}.
        """
        pass
Note: See TracBrowser for help on using the browser.