@@ -30,7 +30,7 @@ from pylons import request, config, tmpl
from rhodecode.lib.auth import LoginRequired
from rhodecode.lib.base import BaseController, render
from rhodecode.lib.indexers import SCHEMA, IDX_NAME, WhooshResultWrapper
from rhodecode.lib.indexers import CHGSETS_SCHEMA, SCHEMA, CHGSET_IDX_NAME, IDX_NAME, WhooshResultWrapper
from webhelpers.paginate import Page
from webhelpers.util import update_params
@@ -54,25 +54,41 @@ class SearchController(BaseController):
c.formated_results = []
c.runtime = ''
c.cur_query = request.GET.get('q', None)
c.cur_type = request.GET.get('type', 'source')
c.cur_type = request.GET.get('type', 'content')
c.cur_search = search_type = {'content': 'content',
'commit': 'content',
'commit': 'message',
'path': 'path',
'repository': 'repository'}\
.get(c.cur_type, 'content')
index_name = {
'content': IDX_NAME,
'commit': CHGSET_IDX_NAME,
'path': IDX_NAME}\
.get(c.cur_type, IDX_NAME)
schema_defn = {
'content': SCHEMA,
'commit': CHGSETS_SCHEMA,
'path': SCHEMA}\
.get(c.cur_type, SCHEMA)
log.debug('IDX: %s' % index_name)
log.debug('SCHEMA: %s' % schema_defn)
if c.cur_query:
cur_query = c.cur_query.lower()
log.debug(cur_query)
p = int(request.params.get('page', 1))
highlight_items = set()
try:
idx = open_dir(config['app_conf']['index_dir'],
indexname=IDX_NAME)
indexname=index_name)
searcher = idx.searcher()
qp = QueryParser(search_type, schema=SCHEMA)
qp = QueryParser(search_type, schema=schema_defn)
if c.repo_name:
cur_query = u'repository:%s %s' % (c.repo_name, cur_query)
@@ -84,13 +100,13 @@ class SearchController(BaseController):
highlight_items.add(query.text)
else:
for i in query.all_terms():
if i[0] == 'content':
if i[0] in ['content', 'message']:
highlight_items.add(i[1])
matcher = query.matcher(searcher)
log.debug(query)
log.debug(highlight_items)
log.debug('query: %s' % query)
log.debug('hl terms: %s' % highlight_items)
results = searcher.search(query)
res_ln = len(results)
c.runtime = '%s results (%.3f seconds)' % (
@@ -99,7 +115,7 @@ class SearchController(BaseController):
def url_generator(**kw):
return update_params("?q=%s&type=%s" \
% (c.cur_query, c.cur_search), **kw)
% (c.cur_query, c.cur_type), **kw)
repo_location = RepoModel().repos_path
c.formated_results = Page(
WhooshResultWrapper(search_type, searcher, matcher,
@@ -35,7 +35,7 @@ from string import strip
from shutil import rmtree
from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter
from whoosh.fields import TEXT, ID, STORED, Schema, FieldType
from whoosh.fields import TEXT, ID, STORED, NUMERIC, BOOLEAN, Schema, FieldType
from whoosh.index import create_in, open_dir
from whoosh.formats import Characters
from whoosh.highlight import highlight, HtmlFormatter, ContextFragmenter
@@ -51,10 +51,11 @@ from rhodecode.lib.utils2 import LazyPro
from rhodecode.lib.utils import BasePasterCommand, Command, add_cache,\
load_rcextensions
log = logging.getLogger(__name__)
# CUSTOM ANALYZER wordsplit + lowercase filter
ANALYZER = RegexTokenizer(expression=r"\w+") | LowercaseFilter()
#INDEX SCHEMA DEFINITION
SCHEMA = Schema(
fileid=ID(unique=True),
@@ -71,6 +72,21 @@ IDX_NAME = 'HG_INDEX'
FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n')
FRAGMENTER = ContextFragmenter(200)
# Whoosh schema for the changeset (commit) index; the indexing daemon adds
# one document per changeset using these fields.
CHGSETS_SCHEMA = Schema(
# changeset raw id (the daemon passes unicode(cs.raw_id)); stored so search
# results can link back to the changeset
raw_id=ID(unique=True, stored=True),
# populated from cs.last; the incremental update queries "last:t" to find
# the changeset it should resume indexing from
last=BOOLEAN(),
# populated from repo.contact
owner=TEXT(),
# repository name (including the repository group); stored for result display
repository=ID(unique=True, stored=True),
# populated from cs.author; stored for result display
author=TEXT(stored=True),
# commit message, tokenized with the custom ANALYZER; stored so the message
# can be highlighted in search results
message=FieldType(format=Characters(), analyzer=ANALYZER,
scorable=True, stored=True),
# space-separated raw ids of the parent changesets
parents=TEXT(),
# space-separated, lower-cased paths of the nodes added / removed / changed
# by the changeset
added=TEXT(),
removed=TEXT(),
changed=TEXT(),
)
CHGSET_IDX_NAME = 'CHGSET_INDEX'
class MakeIndex(BasePasterCommand):
@@ -191,14 +207,20 @@ class WhooshResultWrapper(object):
def get_full_content(self, docid):
res = self.searcher.stored_fields(docid[0])
log.debug('result: %s' % res)
if self.search_type == 'content':
full_repo_path = jn(self.repo_location, res['repository'])
f_path = res['path'].split(full_repo_path)[-1]
f_path = f_path.lstrip(os.sep)
content_short = self.get_short_content(res, docid[1])
res.update({'content_short': content_short,
'content_short_hl': self.highlight(content_short),
'f_path': f_path})
'f_path': f_path
})
elif self.search_type == 'message':
res.update({'message_hl': self.highlight(res['message'])})
return res
@@ -216,6 +238,7 @@ class WhooshResultWrapper(object):
:param size:
"""
memory = [(0, 0)]
if self.matcher.supports('positions'):
for span in self.matcher.spans():
start = span.startchar or 0
end = span.endchar or 0
@@ -228,7 +251,7 @@ class WhooshResultWrapper(object):
yield (start_offseted, end_offseted,)
def highlight(self, content, top=5):
if self.search_type != 'content':
if self.search_type not in ['content', 'message']:
return ''
hl = highlight(
text=content,
@@ -22,6 +22,7 @@
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import with_statement
import os
import sys
@@ -41,12 +42,14 @@ sys.path.append(project_path)
from rhodecode.config.conf import INDEX_EXTENSIONS
from rhodecode.model.scm import ScmModel
from rhodecode.lib.utils2 import safe_unicode
from rhodecode.lib.indexers import SCHEMA, IDX_NAME
from rhodecode.lib.indexers import SCHEMA, IDX_NAME, CHGSETS_SCHEMA, CHGSET_IDX_NAME
from rhodecode.lib.vcs.exceptions import ChangesetError, RepositoryError, \
NodeDoesNotExistError
from whoosh.index import create_in, open_dir, exists_in
from whoosh.query import *
from whoosh.qparser import QueryParser
log = logging.getLogger('whoosh_indexer')
@@ -89,12 +92,19 @@ class WhooshIndexingDaemon(object):
self.filtered_repo_update_paths[repo_name] = repo
self.repo_paths = self.filtered_repo_update_paths
self.initial = False
self.initial = True
if not os.path.isdir(self.index_location):
os.makedirs(self.index_location)
log.info('Cannot run incremental index since it does not'
' yet exist running full build')
elif not exists_in(self.index_location, IDX_NAME):
log.info('Running full index build as the file content'
' index does not exist')
elif not exists_in(self.index_location, CHGSET_IDX_NAME):
log.info('Running full index build as the changeset'
def get_paths(self, repo):
@@ -158,35 +168,103 @@ class WhooshIndexingDaemon(object):
return indexed, indexed_w_content
def build_index(self):
if os.path.exists(self.index_location):
log.debug('removing previous index')
rmtree(self.index_location)
def index_changesets(self, writer, repo_name, repo, start_rev=None):
Add all changesets in the vcs repo starting at start_rev
to the index writer
if not os.path.exists(self.index_location):
os.mkdir(self.index_location)
:param writer: the whoosh index writer to add to
:param repo_name: name of the repository from whence the
changeset originates including the repository group
:param repo: the vcs repository instance to index changesets for,
the presumption is the repo has changesets to index
:param start_rev=None: the full sha id to start indexing from
if start_rev is None then index from the first changeset in
the repo
if start_rev is None:
start_rev = repo[0].raw_id
log.debug('indexing changesets in %s starting at rev: %s' % (repo_name, start_rev))
idx = create_in(self.index_location, SCHEMA, indexname=IDX_NAME)
writer = idx.writer()
log.debug('BUILDING INDEX FOR EXTENSIONS %s '
'AND REPOS %s' % (INDEX_EXTENSIONS, self.repo_paths.keys()))
indexed=0
for cs in repo.get_changesets(start=start_rev):
writer.add_document(
raw_id=unicode(cs.raw_id),
owner=unicode(repo.contact),
repository=safe_unicode(repo_name),
author=cs.author,
message=cs.message,
last=cs.last,
added=u' '.join([node.path for node in cs.added]).lower(),
removed=u' '.join([node.path for node in cs.removed]).lower(),
changed=u' '.join([node.path for node in cs.changed]).lower(),
# use a distinct name for the comprehension variable: in Python 2 a list
# comprehension leaks its loop variable and would rebind the outer `cs`
parents=u' '.join([p.raw_id for p in cs.parents]),
indexed += 1
for repo_name, repo in self.repo_paths.items():
log.debug('building index @ %s' % repo.path)
log.debug('indexed %d changesets for repo %s' % (indexed, repo_name))
def index_files(self, file_idx_writer, repo_name, repo):
i_cnt = iwc_cnt = 0
log.debug('building index for [%s]' % repo.path)
for idx_path in self.get_paths(repo):
i, iwc = self.add_doc(writer, idx_path, repo, repo_name)
i, iwc = self.add_doc(file_idx_writer, idx_path, repo, repo_name)
i_cnt += i
iwc_cnt += iwc
log.debug('added %s files %s with content for repo %s' % (
i_cnt + iwc_cnt, iwc_cnt, repo.path)
log.debug('added %s files %s with content for repo %s' % (i_cnt + iwc_cnt, iwc_cnt, repo.path))
def update_changeset_index(self):
idx = open_dir(self.index_location, indexname=CHGSET_IDX_NAME)
with idx.searcher() as searcher:
writer_is_dirty = False
# skip indexing if there aren't any revs in the repo
num_of_revs = len(repo)
if num_of_revs < 1:
continue
qp = QueryParser('repository', schema=CHGSETS_SCHEMA)
q = qp.parse(u"last:t AND %s" % repo_name)
results = searcher.search(q)
# default to scanning the entire repo
last_rev = 0
start_id = None
log.debug('>> COMMITING CHANGES <<')
if len(results) > 0:
# assuming that there is only one result, if not this
# may require a full re-index.
start_id = results[0]['raw_id']
last_rev = repo.get_changeset(revision=start_id).revision
# there are new changesets to index or a new repo to index
if last_rev == 0 or num_of_revs > last_rev + 1:
# delete the docs in the index for the previous last changeset(s)
for hit in results:
q = qp.parse(u"last:t AND %s AND raw_id:%s" %
(repo_name, hit['raw_id']))
writer.delete_by_query(q)
# index from the previous last changeset + all new ones
self.index_changesets(writer, repo_name, repo, start_id)
writer_is_dirty = True
finally:
if writer_is_dirty:
log.debug('>> COMMITING CHANGES TO CHANGESET INDEX<<')
writer.commit(merge=True)
log.debug('>>> FINISHED BUILDING INDEX <<<')
log.debug('>> COMMITTED CHANGES TO CHANGESET INDEX<<')
# actually call cancel(): a bare `writer.cancel` only references the method
# and never cancels the writer
writer.cancel()
def update_index(self):
def update_file_index(self):
log.debug((u'STARTING INCREMENTAL INDEXING UPDATE FOR EXTENSIONS %s '
'AND REPOS %s') % (INDEX_EXTENSIONS, self.repo_paths.keys()))
@@ -196,8 +274,10 @@ class WhooshIndexingDaemon(object):
# The set of all paths we need to re-index
to_index = set()
reader = idx.reader()
with idx.reader() as reader:
# Loop over the stored fields in the index
for fields in reader.all_stored_fields():
@@ -222,12 +302,14 @@ class WhooshIndexingDaemon(object):
indexed_path, mtime, indexed_time)
writer.delete_by_term('fileid', indexed_path)
to_index.add(indexed_path)
except (ChangesetError, NodeDoesNotExistError):
# This file was deleted since it was indexed
log.debug('removing from index %s' % indexed_path)
writer.delete_by_term('path', indexed_path)
# Loop over the files in the filesystem
# Assume we have a function that gathers the filenames of the
@@ -235,6 +317,9 @@ class WhooshIndexingDaemon(object):
ri_cnt_total = 0 # indexed
riwc_cnt_total = 0 # indexed with content
# skip indexing if there aren't any revisions
if len(repo) < 1:
ri_cnt = 0 # indexed
riwc_cnt = 0 # indexed with content
for path in self.get_paths(repo):
@@ -244,6 +329,7 @@ class WhooshIndexingDaemon(object):
# This is either a file that's changed, or a new file
# that wasn't indexed before. So index it!
i, iwc = self.add_doc(writer, path, repo, repo_name)
log.debug('re indexing %s' % path)
ri_cnt += i
ri_cnt_total += 1
@@ -255,13 +341,50 @@ class WhooshIndexingDaemon(object):
log.debug('indexed %s files in total and %s with content' % (
ri_cnt_total, riwc_cnt_total)
log.debug('>>> FINISHED REBUILDING INDEX <<<')
writer.cancel()
def build_indexes(self):
chgset_idx = create_in(self.index_location, CHGSETS_SCHEMA, indexname=CHGSET_IDX_NAME)
chgset_idx_writer = chgset_idx.writer()
file_idx = create_in(self.index_location, SCHEMA, indexname=IDX_NAME)
file_idx_writer = file_idx.writer()
self.index_files(file_idx_writer, repo_name, repo)
self.index_changesets(chgset_idx_writer, repo_name, repo)
file_idx_writer.commit(merge=True)
chgset_idx_writer.commit(merge=True)
def update_indexes(self):
self.update_file_index()
self.update_changeset_index()
def run(self, full_index=False):
"""Run daemon"""
if full_index or self.initial:
self.build_index()
self.build_indexes()
self.update_index()
self.update_indexes()
@@ -61,7 +61,7 @@
</div>
<div class="select">
${h.select('type',c.cur_type,[('content',_('File contents')),
##('commit',_('Commit messages')),
('commit',_('Commit messages')),
('path',_('File names')),
##('repository',_('Repository names')),
])}
@@ -72,13 +72,13 @@
${h.end_form()}
<div class="search">
%if c.cur_search == 'content':
%if c.cur_type == 'content':
<%include file='search_content.html'/>
%elif c.cur_search == 'path':
%elif c.cur_type == 'path':
<%include file='search_path.html'/>
%elif c.cur_search == 'commit':
%elif c.cur_type == 'commit':
<%include file='search_commit.html'/>
%elif c.cur_search == 'repository':
%elif c.cur_type == 'repository':
<%include file='search_repository.html'/>
%endif
##commit highlighting
%for cnt,sr in enumerate(c.formated_results):
%if h.HasRepoPermissionAny('repository.write','repository.read','repository.admin')(sr['repository'],'search results check'):
<div class="table">
<div id="body${cnt}" class="codeblock">
<div class="code-header">
<div class="search-path">${h.link_to(h.literal('%s » %s' % (sr['repository'],sr['raw_id'])),
h.url('changeset_home',repo_name=sr['repository'],revision=sr['raw_id']))}
<div class="left">
<div class="author">
<div class="gravatar">
<img alt="gravatar" src="${h.gravatar_url(h.email(sr['author']),20)}"/>
<span>${h.person(sr['author'])}</span><br/>
<span><a href="mailto:${h.email_or_none(sr['author'])}">${h.email_or_none(sr['author'])}</a></span><br/>
%if sr['message_hl']:
<div class="search-code-body">
<pre>${h.literal(sr['message_hl'])}</pre>
%else:
<div class="message">${h.urlify_commit(sr['message'], sr['repository'])}</div>
%if cnt == 0:
<div class="error">${_('Permission denied')}</div>
%endfor
%if c.cur_query and c.formated_results:
<div class="pagination-wh pagination-left">
${c.formated_results.pager('$link_previous ~2~ $link_next')}
@@ -35,3 +35,58 @@ class TestSearchController(TestControlle
{'q': 'repository:%s def test' % HG_REPO})
response.mustcontain('4 results')
def test_search_last(self):
self.log_user()
response = self.app.get(url(controller='search', action='index'),
{'q': 'last:t', 'type': 'commit'})
response.mustcontain('2 results')
def test_search_commit_message(self):
{'q': 'bother to ask where to fetch repo during tests',
'type': 'commit'})
response.mustcontain('a00c1b6f5d7a6ae678fd553a8b81d92367f7ecf1')
response.mustcontain('c6eb379775c578a95dad8ddab53f963b80894850')
def test_search_commit_message_hg_repo(self):
response = self.app.get(url(controller='search', action='index',
search_repo=HG_REPO),
response.mustcontain('1 results')
def test_search_commit_changed_file(self):
{'q': 'changed:tests/utils.py',
def test_search_commit_added_file(self):
{'q': 'added:README.rst',
#HG
response.mustcontain('3803844fdbd3b711175fc3da9bdacfcd6d29a6fb')
#GIT
response.mustcontain('ff7ca51e58c505fec0dd2491de52c622bb7a806b')
def test_search_author(self):
{'q': 'author:marcin@python-blog.com raw_id:b986218ba1c9b0d6a259fac9b050b1724ed8e545',
Status change: