@@ -30,7 +30,7 @@ from pylons import request, config, tmpl
from rhodecode.lib.auth import LoginRequired
from rhodecode.lib.base import BaseController, render
from rhodecode.lib.indexers import SCHEMA, IDX_NAME, WhooshResultWrapper
from rhodecode.lib.indexers import CHGSETS_SCHEMA, SCHEMA, CHGSET_IDX_NAME, IDX_NAME, WhooshResultWrapper
from webhelpers.paginate import Page
from webhelpers.util import update_params
@@ -54,25 +54,41 @@ class SearchController(BaseController):
c.formated_results = []
c.runtime = ''
c.cur_query = request.GET.get('q', None)
c.cur_type = request.GET.get('type', 'source')
c.cur_type = request.GET.get('type', 'content')
c.cur_search = search_type = {'content': 'content',
'commit': 'content',
'commit': 'message',
'path': 'path',
'repository': 'repository'}\
.get(c.cur_type, 'content')
index_name = {
'content': IDX_NAME,
'commit': CHGSET_IDX_NAME,
'path': IDX_NAME}\
.get(c.cur_type, IDX_NAME)
schema_defn = {
'content': SCHEMA,
'commit': CHGSETS_SCHEMA,
'path': SCHEMA}\
.get(c.cur_type, SCHEMA)
log.debug('IDX: %s' % index_name)
log.debug('SCHEMA: %s' % schema_defn)
if c.cur_query:
cur_query = c.cur_query.lower()
log.debug(cur_query)
p = int(request.params.get('page', 1))
highlight_items = set()
try:
idx = open_dir(config['app_conf']['index_dir'],
indexname=IDX_NAME)
indexname=index_name)
searcher = idx.searcher()
qp = QueryParser(search_type, schema=SCHEMA)
qp = QueryParser(search_type, schema=schema_defn)
if c.repo_name:
cur_query = u'repository:%s %s' % (c.repo_name, cur_query)
@@ -84,13 +100,13 @@ class SearchController(BaseController):
highlight_items.add(query.text)
else:
for i in query.all_terms():
if i[0] == 'content':
if i[0] in ['content', 'message']:
highlight_items.add(i[1])
matcher = query.matcher(searcher)
log.debug(query)
log.debug(highlight_items)
log.debug('query: %s' % query)
log.debug('hl terms: %s' % highlight_items)
results = searcher.search(query)
res_ln = len(results)
c.runtime = '%s results (%.3f seconds)' % (
@@ -99,7 +115,7 @@ class SearchController(BaseController):
def url_generator(**kw):
return update_params("?q=%s&type=%s" \
% (c.cur_query, c.cur_search), **kw)
% (c.cur_query, c.cur_type), **kw)
repo_location = RepoModel().repos_path
c.formated_results = Page(
WhooshResultWrapper(search_type, searcher, matcher,
@@ -35,7 +35,7 @@ from string import strip
from shutil import rmtree
from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter
from whoosh.fields import TEXT, ID, STORED, Schema, FieldType
from whoosh.fields import TEXT, ID, STORED, NUMERIC, BOOLEAN, Schema, FieldType
from whoosh.index import create_in, open_dir
from whoosh.formats import Characters
from whoosh.highlight import highlight, HtmlFormatter, ContextFragmenter
@@ -51,10 +51,11 @@ from rhodecode.lib.utils2 import LazyPro
from rhodecode.lib.utils import BasePasterCommand, Command, add_cache,\
load_rcextensions
log = logging.getLogger(__name__)
# CUSTOM ANALYZER wordsplit + lowercase filter
ANALYZER = RegexTokenizer(expression=r"\w+") | LowercaseFilter()
#INDEX SCHEMA DEFINITION
SCHEMA = Schema(
fileid=ID(unique=True),
@@ -71,6 +72,21 @@ IDX_NAME = 'HG_INDEX'
FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n')
FRAGMENTER = ContextFragmenter(200)
# Whoosh schema for changeset (commit) documents. The index built from it is
# created under CHGSET_IDX_NAME, in the same index directory as the
# file-content index that uses SCHEMA.
CHGSETS_SCHEMA = Schema(
# changeset hash; stored so that search hits can link to the changeset
raw_id=ID(unique=True, stored=True),
# copied from cs.last by the indexer; the incremental changeset update
# queries "last:t" to find where the previous indexing run stopped
last=BOOLEAN(),
owner=TEXT(),
repository=ID(unique=True, stored=True),
author=TEXT(stored=True),
# commit message, tokenized with ANALYZER and stored so that matching terms
# can be highlighted in the search results
message=FieldType(format=Characters(), analyzer=ANALYZER,
scorable=True, stored=True),
# space-separated raw ids of the parent changesets
parents=TEXT(),
# space-separated, lower-cased node paths added/removed/changed by the
# changeset
added=TEXT(),
removed=TEXT(),
changed=TEXT(),
)
CHGSET_IDX_NAME = 'CHGSET_INDEX'
class MakeIndex(BasePasterCommand):
@@ -191,14 +207,20 @@ class WhooshResultWrapper(object):
def get_full_content(self, docid):
res = self.searcher.stored_fields(docid[0])
full_repo_path = jn(self.repo_location, res['repository'])
f_path = res['path'].split(full_repo_path)[-1]
f_path = f_path.lstrip(os.sep)
log.debug('result: %s' % res)
if self.search_type == 'content':
content_short = self.get_short_content(res, docid[1])
res.update({'content_short': content_short,
'content_short_hl': self.highlight(content_short),
'f_path': f_path
})
elif self.search_type == 'message':
res.update({'message_hl': self.highlight(res['message'])})
'f_path': f_path})
return res
@@ -216,19 +238,20 @@ class WhooshResultWrapper(object):
:param size:
"""
memory = [(0, 0)]
for span in self.matcher.spans():
start = span.startchar or 0
end = span.endchar or 0
start_offseted = max(0, start - self.fragment_size)
end_offseted = end + self.fragment_size
if self.matcher.supports('positions'):
if start_offseted < memory[-1][1]:
start_offseted = memory[-1][1]
memory.append((start_offseted, end_offseted,))
yield (start_offseted, end_offseted,)
def highlight(self, content, top=5):
if self.search_type != 'content':
if self.search_type not in ['content', 'message']:
return ''
hl = highlight(
text=content,
@@ -22,6 +22,7 @@
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import with_statement
import os
import sys
@@ -41,12 +42,14 @@ sys.path.append(project_path)
from rhodecode.config.conf import INDEX_EXTENSIONS
from rhodecode.model.scm import ScmModel
from rhodecode.lib.utils2 import safe_unicode
from rhodecode.lib.indexers import SCHEMA, IDX_NAME
from rhodecode.lib.indexers import SCHEMA, IDX_NAME, CHGSETS_SCHEMA, CHGSET_IDX_NAME
from rhodecode.lib.vcs.exceptions import ChangesetError, RepositoryError, \
NodeDoesNotExistError
from whoosh.index import create_in, open_dir, exists_in
from whoosh.query import *
from whoosh.qparser import QueryParser
log = logging.getLogger('whoosh_indexer')
@@ -89,12 +92,19 @@ class WhooshIndexingDaemon(object):
self.filtered_repo_update_paths[repo_name] = repo
self.repo_paths = self.filtered_repo_update_paths
self.initial = False
self.initial = True
if not os.path.isdir(self.index_location):
os.makedirs(self.index_location)
log.info('Cannot run incremental index since it does not'
' yet exist running full build')
elif not exists_in(self.index_location, IDX_NAME):
log.info('Running full index build as the file content'
' index does not exist')
elif not exists_in(self.index_location, CHGSET_IDX_NAME):
log.info('Running full index build as the changeset'
def get_paths(self, repo):
@@ -158,35 +168,103 @@ class WhooshIndexingDaemon(object):
return indexed, indexed_w_content
def build_index(self):
if os.path.exists(self.index_location):
log.debug('removing previous index')
rmtree(self.index_location)
def index_changesets(self, writer, repo_name, repo, start_rev=None):
Add all changesets in the vcs repo, starting at start_rev,
to the index writer
:param writer: the whoosh index writer to add to
:param repo_name: name of the repository from whence the
changeset originates including the repository group
:param repo: the vcs repository instance to index changesets for,
the presumption is the repo has changesets to index
:param start_rev=None: the full sha id to start indexing from
if start_rev is None then index from the first changeset in
the repo
if start_rev is None:
start_rev = repo[0].raw_id
log.debug('indexing changesets in %s starting at rev: %s' % (repo_name, start_rev))
if not os.path.exists(self.index_location):
os.mkdir(self.index_location)
indexed=0
for cs in repo.get_changesets(start=start_rev):
writer.add_document(
raw_id=unicode(cs.raw_id),
owner=unicode(repo.contact),
repository=safe_unicode(repo_name),
author=cs.author,
message=cs.message,
last=cs.last,
added=u' '.join([node.path for node in cs.added]).lower(),
removed=u' '.join([node.path for node in cs.removed]).lower(),
changed=u' '.join([node.path for node in cs.changed]).lower(),
parents=u' '.join([cs.raw_id for cs in cs.parents]),
indexed += 1
idx = create_in(self.index_location, SCHEMA, indexname=IDX_NAME)
writer = idx.writer()
log.debug('BUILDING INDEX FOR EXTENSIONS %s '
'AND REPOS %s' % (INDEX_EXTENSIONS, self.repo_paths.keys()))
log.debug('indexed %d changesets for repo %s' % (indexed, repo_name))
def index_files(self, file_idx_writer, repo_name, repo):
i_cnt = iwc_cnt = 0
log.debug('building index for [%s]' % repo.path)
for idx_path in self.get_paths(repo):
i, iwc = self.add_doc(file_idx_writer, idx_path, repo, repo_name)
i_cnt += i
iwc_cnt += iwc
log.debug('added %s files %s with content for repo %s' % (i_cnt + iwc_cnt, iwc_cnt, repo.path))
for repo_name, repo in self.repo_paths.items():
log.debug('building index @ %s' % repo.path)
i, iwc = self.add_doc(writer, idx_path, repo, repo_name)
log.debug('added %s files %s with content for repo %s' % (
i_cnt + iwc_cnt, iwc_cnt, repo.path)
def update_changeset_index(self):
idx = open_dir(self.index_location, indexname=CHGSET_IDX_NAME)
with idx.searcher() as searcher:
writer_is_dirty = False
# skip indexing if there aren't any revs in the repo
num_of_revs = len(repo)
if num_of_revs < 1:
continue
qp = QueryParser('repository', schema=CHGSETS_SCHEMA)
q = qp.parse(u"last:t AND %s" % repo_name)
results = searcher.search(q)
# default to scanning the entire repo
last_rev = 0
start_id = None
log.debug('>> COMMITING CHANGES <<')
writer.commit(merge=True)
log.debug('>>> FINISHED BUILDING INDEX <<<')
if len(results) > 0:
# assuming that there is only one result, if not this
# may require a full re-index.
start_id = results[0]['raw_id']
last_rev = repo.get_changeset(revision=start_id).revision
# there are new changesets to index or a new repo to index
if last_rev == 0 or num_of_revs > last_rev + 1:
# delete the docs in the index for the previous last changeset(s)
for hit in results:
q = qp.parse(u"last:t AND %s AND raw_id:%s" %
(repo_name, hit['raw_id']))
writer.delete_by_query(q)
def update_index(self):
# index from the previous last changeset + all new ones
self.index_changesets(writer, repo_name, repo, start_id)
writer_is_dirty = True
finally:
if writer_is_dirty:
log.debug('>> COMMITING CHANGES TO CHANGESET INDEX<<')
log.debug('>> COMMITTED CHANGES TO CHANGESET INDEX<<')
# cancel() must actually be called: a bare `writer.cancel` only looks the
# method up and discards it, so the unused writer is never cancelled when
# there was nothing new to index.
writer.cancel()
def update_file_index(self):
log.debug((u'STARTING INCREMENTAL INDEXING UPDATE FOR EXTENSIONS %s '
'AND REPOS %s') % (INDEX_EXTENSIONS, self.repo_paths.keys()))
@@ -196,72 +274,117 @@ class WhooshIndexingDaemon(object):
# The set of all paths we need to re-index
to_index = set()
reader = idx.reader()
with idx.reader() as reader:
# Loop over the stored fields in the index
for fields in reader.all_stored_fields():
indexed_path = fields['path']
indexed_repo_path = fields['repository']
indexed_paths.add(indexed_path)
if not indexed_repo_path in self.filtered_repo_update_paths:
repo = self.repo_paths[indexed_repo_path]
node = self.get_node(repo, indexed_path)
# Check if this file was changed since it was indexed
indexed_time = fields['modtime']
mtime = self.get_node_mtime(node)
if mtime > indexed_time:
# The file has changed, delete it and add it to the list of
# files to reindex
log.debug('adding to reindex list %s mtime: %s vs %s' % (
indexed_path, mtime, indexed_time)
writer.delete_by_term('fileid', indexed_path)
to_index.add(indexed_path)
except (ChangesetError, NodeDoesNotExistError):
# This file was deleted since it was indexed
log.debug('removing from index %s' % indexed_path)
writer.delete_by_term('path', indexed_path)
# Loop over the files in the filesystem
# Assume we have a function that gathers the filenames of the
# documents to be indexed
ri_cnt_total = 0 # indexed
riwc_cnt_total = 0 # indexed with content
# skip indexing if there aren't any revisions
if len(repo) < 1:
ri_cnt = 0 # indexed
riwc_cnt = 0 # indexed with content
for path in self.get_paths(repo):
path = safe_unicode(path)
if path in to_index or path not in indexed_paths:
# This is either a file that's changed, or a new file
# that wasn't indexed before. So index it!
i, iwc = self.add_doc(writer, path, repo, repo_name)
log.debug('re indexing %s' % path)
ri_cnt += i
ri_cnt_total += 1
riwc_cnt += iwc
riwc_cnt_total += iwc
ri_cnt + riwc_cnt, riwc_cnt, repo.path)
log.debug('indexed %s files in total and %s with content' % (
ri_cnt_total, riwc_cnt_total)
log.debug('>>> FINISHED REBUILDING INDEX <<<')
writer.cancel()
def build_indexes(self):
chgset_idx = create_in(self.index_location, CHGSETS_SCHEMA, indexname=CHGSET_IDX_NAME)
chgset_idx_writer = chgset_idx.writer()
file_idx = create_in(self.index_location, SCHEMA, indexname=IDX_NAME)
file_idx_writer = file_idx.writer()
self.index_files(file_idx_writer, repo_name, repo)
self.index_changesets(chgset_idx_writer, repo_name, repo)
file_idx_writer.commit(merge=True)
chgset_idx_writer.commit(merge=True)
def update_indexes(self):
self.update_file_index()
self.update_changeset_index()
def run(self, full_index=False):
"""Run daemon"""
if full_index or self.initial:
self.build_index()
self.build_indexes()
self.update_index()
self.update_indexes()
@@ -61,7 +61,7 @@
</div>
<div class="select">
${h.select('type',c.cur_type,[('content',_('File contents')),
##('commit',_('Commit messages')),
('commit',_('Commit messages')),
('path',_('File names')),
##('repository',_('Repository names')),
])}
@@ -72,13 +72,13 @@
${h.end_form()}
<div class="search">
%if c.cur_search == 'content':
%if c.cur_type == 'content':
<%include file='search_content.html'/>
%elif c.cur_search == 'path':
%elif c.cur_type == 'path':
<%include file='search_path.html'/>
%elif c.cur_search == 'commit':
%elif c.cur_type == 'commit':
<%include file='search_commit.html'/>
%elif c.cur_search == 'repository':
%elif c.cur_type == 'repository':
<%include file='search_repository.html'/>
%endif
##commit highlighting
%for cnt,sr in enumerate(c.formated_results):
%if h.HasRepoPermissionAny('repository.write','repository.read','repository.admin')(sr['repository'],'search results check'):
<div class="table">
<div id="body${cnt}" class="codeblock">
<div class="code-header">
<div class="search-path">${h.link_to(h.literal('%s » %s' % (sr['repository'],sr['raw_id'])),
h.url('changeset_home',repo_name=sr['repository'],revision=sr['raw_id']))}
<div class="left">
<div class="author">
<div class="gravatar">
<img alt="gravatar" src="${h.gravatar_url(h.email(sr['author']),20)}"/>
<span>${h.person(sr['author'])}</span><br/>
<span><a href="mailto:${h.email_or_none(sr['author'])}">${h.email_or_none(sr['author'])}</a></span><br/>
%if sr['message_hl']:
<div class="search-code-body">
<pre>${h.literal(sr['message_hl'])}</pre>
%else:
<div class="message">${h.urlify_commit(sr['message'], sr['repository'])}</div>
%if cnt == 0:
<div class="error">${_('Permission denied')}</div>
%endfor
%if c.cur_query and c.formated_results:
<div class="pagination-wh pagination-left">
${c.formated_results.pager('$link_previous ~2~ $link_next')}
@@ -35,3 +35,58 @@ class TestSearchController(TestControlle
{'q': 'repository:%s def test' % HG_REPO})
response.mustcontain('4 results')
def test_search_last(self):
self.log_user()
response = self.app.get(url(controller='search', action='index'),
{'q': 'last:t', 'type': 'commit'})
response.mustcontain('2 results')
def test_search_commit_message(self):
{'q': 'bother to ask where to fetch repo during tests',
'type': 'commit'})
response.mustcontain('a00c1b6f5d7a6ae678fd553a8b81d92367f7ecf1')
response.mustcontain('c6eb379775c578a95dad8ddab53f963b80894850')
def test_search_commit_message_hg_repo(self):
response = self.app.get(url(controller='search', action='index',
search_repo=HG_REPO),
response.mustcontain('1 results')
def test_search_commit_changed_file(self):
{'q': 'changed:tests/utils.py',
def test_search_commit_added_file(self):
{'q': 'added:README.rst',
#HG
response.mustcontain('3803844fdbd3b711175fc3da9bdacfcd6d29a6fb')
#GIT
response.mustcontain('ff7ca51e58c505fec0dd2491de52c622bb7a806b')
def test_search_author(self):
{'q': 'author:marcin@python-blog.com raw_id:b986218ba1c9b0d6a259fac9b050b1724ed8e545',
Status change: