@@ -20,48 +20,49 @@
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import with_statement
import os
import sys
import logging
import traceback
from shutil import rmtree
from time import mktime
from os.path import dirname as dn
from os.path import join as jn
# add the project root to sys.path so the rhodecode imports below resolve
project_path = dn(dn(dn(dn(os.path.realpath(__file__)))))
sys.path.append(project_path)
from rhodecode.config.conf import INDEX_EXTENSIONS
from rhodecode.model.scm import ScmModel
from rhodecode.model.db import Repository
from rhodecode.lib.utils2 import safe_unicode, safe_str
from rhodecode.lib.indexers import SCHEMA, IDX_NAME, CHGSETS_SCHEMA, \
CHGSET_IDX_NAME
from rhodecode.lib.vcs.exceptions import ChangesetError, RepositoryError, \
NodeDoesNotExistError
from whoosh.index import create_in, open_dir, exists_in
from whoosh.query import *
from whoosh.qparser import QueryParser
log = logging.getLogger('whoosh_indexer')
class WhooshIndexingDaemon(object):
"""
Daemon for atomic indexing jobs
def __init__(self, indexname=IDX_NAME, index_location=None,
repo_location=None, sa=None, repo_list=None,
repo_update_list=None):
self.indexname = indexname
@@ -77,79 +78,92 @@ class WhooshIndexingDaemon(object):
#filter repo list
if repo_list:
#Fix non-ascii repo names to unicode
repo_list = map(safe_unicode, repo_list)
self.filtered_repo_paths = {}
for repo_name, repo in self.repo_paths.items():
if repo_name in repo_list:
self.filtered_repo_paths[repo_name] = repo
self.repo_paths = self.filtered_repo_paths
#filter update repo list
self.filtered_repo_update_paths = {}
if repo_update_list:
if repo_name in repo_update_list:
self.filtered_repo_update_paths[repo_name] = repo
self.repo_paths = self.filtered_repo_update_paths
self.initial = True
if not os.path.isdir(self.index_location):
os.makedirs(self.index_location)
log.info('Cannot run incremental index since it does not'
' yet exist running full build')
log.info('Cannot run incremental index since it does not '
'yet exist running full build')
elif not exists_in(self.index_location, IDX_NAME):
log.info('Running full index build as the file content'
' index does not exist')
log.info('Running full index build as the file content '
'index does not exist')
elif not exists_in(self.index_location, CHGSET_IDX_NAME):
log.info('Running full index build as the changeset'
log.info('Running full index build as the changeset '
else:
self.initial = False
def _get_index_revision(self, repo):
    """
    Return the revision that should be indexed for the given repo

    The repository is looked up by ``repo.name``; when a matching entry
    is found its ``landing_rev`` is returned, otherwise 'tip'.

    :param repo: instance of vcs repo
    """
    repo_entry = Repository.get_by_repo_name(repo.name)
    if not repo_entry:
        # nothing stored for this repo, fall back to the default
        return 'tip'
    return repo_entry.landing_rev
def _get_index_changeset(self, repo):
index_rev = self._get_index_revision(repo)
cs = repo.get_changeset(index_rev)
return cs
def get_paths(self, repo):
    """
    Recursively walk the root dir of the changeset being indexed and
    return a set of all paths in that dir, based on the repository
    walk function

    :param repo: instance of vcs repo
    :return: set of paths, each one being ``repo.path`` joined with the
        path of a file yielded by the walk
    """
    index_paths_ = set()
    try:
        # walk the changeset selected by _get_index_changeset instead
        # of a hard-coded 'tip'
        cs = self._get_index_changeset(repo)
        for _topnode, _dirs, files in cs.walk('/'):
            for f in files:
                index_paths_.add(jn(safe_str(repo.path),
                                    safe_str(f.path)))
    except RepositoryError:
        # best effort: log the failure and return whatever has been
        # collected so far
        log.debug(traceback.format_exc())
    return index_paths_
def get_node(self, repo, path):
n_path = path[len(repo.path) + 1:]
node = repo.get_changeset().get_node(n_path)
node = cs.get_node(n_path)
return node
def get_node_mtime(self, node):
    """
    Return the date of the node's last changeset as a timestamp

    :param node: node exposing ``last_changeset.date``
    :return: result of ``mktime`` applied to the time tuple of that date
    """
    last_change_date = node.last_changeset.date
    return mktime(last_change_date.timetuple())
def add_doc(self, writer, path, repo, repo_name):
Adding doc to writer this function itself fetches data from
the instance of vcs backend
node = self.get_node(repo, path)
indexed = indexed_w_content = 0
# we just index the content of chosen files, and skip binary files
if node.extension in INDEX_EXTENSIONS and not node.is_binary:
u_content = node.content
if not isinstance(u_content, unicode):
log.warning(' >> %s Could not get this content as unicode '
'replacing with empty content' % path)
u_content = u''
log.debug(' >> %s [WITH CONTENT]' % path)
indexed_w_content += 1
@@ -201,49 +215,50 @@ class WhooshIndexingDaemon(object):
date=cs._timestamp,
repository=safe_unicode(repo_name),
author=cs.author,
message=cs.message,
last=cs.last,
added=u' '.join([safe_unicode(node.path) for node in cs.added]).lower(),
removed=u' '.join([safe_unicode(node.path) for node in cs.removed]).lower(),
changed=u' '.join([safe_unicode(node.path) for node in cs.changed]).lower(),
parents=u' '.join([cs.raw_id for cs in cs.parents]),
)
indexed += 1
log.debug('indexed %d changesets for repo %s' % (indexed, repo_name))
return indexed
def index_files(self, file_idx_writer, repo_name, repo):
    """
    Index files for given repo_name

    :param file_idx_writer: the whoosh index writer to add to
    :param repo_name: name of the repository we're indexing
    :param repo: instance of vcs repo
    :return: tuple with the two counters returned by ``add_doc``, each
        summed over all indexed paths
    """
    i_cnt = iwc_cnt = 0
    # a single start message that also names the revision being indexed
    log.debug('building index for %s @revision:%s' % (repo.path,
                                    self._get_index_revision(repo)))
    for idx_path in self.get_paths(repo):
        i, iwc = self.add_doc(file_idx_writer, idx_path, repo, repo_name)
        i_cnt += i
        iwc_cnt += iwc

    log.debug('added %s files %s with content for repo %s' %
              (i_cnt + iwc_cnt, iwc_cnt, repo.path))
    return i_cnt, iwc_cnt
def update_changeset_index(self):
idx = open_dir(self.index_location, indexname=CHGSET_IDX_NAME)
with idx.searcher() as searcher:
writer = idx.writer()
writer_is_dirty = False
indexed_total = 0
repo_name = None
# skip indexing if there aren't any revs in the repo
num_of_revs = len(repo)
if num_of_revs < 1:
continue
@@ -255,57 +270,56 @@ class WhooshIndexingDaemon(object):
# default to scanning the entire repo
last_rev = 0
start_id = None
if len(results) > 0:
# assuming that there is only one result, if not this
# may require a full re-index.
start_id = results[0]['raw_id']
last_rev = repo.get_changeset(revision=start_id).revision
# there are new changesets to index or a new repo to index
if last_rev == 0 or num_of_revs > last_rev + 1:
# delete the docs in the index for the previous
# last changeset(s)
for hit in results:
q = qp.parse(u"last:t AND %s AND raw_id:%s" %
(repo_name, hit['raw_id']))
writer.delete_by_query(q)
# index from the previous last changeset + all new ones
indexed_total += self.index_changesets(writer,
repo_name, repo, start_id)
writer_is_dirty = True
log.debug('indexed %s changesets for repo %s' % (
indexed_total, repo_name)
finally:
if writer_is_dirty:
log.debug('>> COMMITING CHANGES TO CHANGESET INDEX<<')
writer.commit(merge=True)
log.debug('>>> FINISHED REBUILDING CHANGESET INDEX <<<')
writer.cancel
log.debug('>> NOTHING TO COMMIT TO CHANGESET INDEX<<')
def update_file_index(self):
log.debug((u'STARTING INCREMENTAL INDEXING UPDATE FOR EXTENSIONS %s '
'AND REPOS %s') % (INDEX_EXTENSIONS, self.repo_paths.keys()))
idx = open_dir(self.index_location, indexname=self.indexname)
# The set of all paths in the index
indexed_paths = set()
# The set of all paths we need to re-index
to_index = set()
with idx.reader() as reader:
# Loop over the stored fields in the index
for fields in reader.all_stored_fields():
indexed_path = fields['path']
indexed_repo_path = fields['repository']
indexed_paths.add(indexed_path)
if not indexed_repo_path in self.filtered_repo_update_paths:
Status change: