@@ -77,194 +77,198 @@ class WhooshIndexingDaemon(object):
raise Exception('You have to provide repositories location')
self.repo_paths = ScmModel(sa).repo_scan(self.repo_location)
#filter repo list
if repo_list:
#Fix non-ascii repo names to unicode
repo_list = map(safe_unicode, repo_list)
self.filtered_repo_paths = {}
for repo_name, repo in self.repo_paths.items():
if repo_name in repo_list:
self.filtered_repo_paths[repo_name] = repo
self.repo_paths = self.filtered_repo_paths
#filter update repo list
self.filtered_repo_update_paths = {}
if repo_update_list:
if repo_name in repo_update_list:
self.filtered_repo_update_paths[repo_name] = repo
self.repo_paths = self.filtered_repo_update_paths
self.initial = True
if not os.path.isdir(self.index_location):
os.makedirs(self.index_location)
log.info('Cannot run incremental index since it does not '
'yet exist running full build')
elif not exists_in(self.index_location, IDX_NAME):
log.info('Running full index build as the file content '
'index does not exist')
elif not exists_in(self.index_location, CHGSET_IDX_NAME):
log.info('Running full index build as the changeset '
else:
self.initial = False
def _get_index_revision(self, repo):
db_repo = Repository.get_by_repo_name(repo.name_unicode)
landing_rev = 'tip'
if db_repo:
_rev_type, _rev = db_repo.landing_rev
landing_rev = _rev
return landing_rev
def _get_index_changeset(self, repo, index_rev=None):
if not index_rev:
index_rev = self._get_index_revision(repo)
cs = repo.get_changeset(index_rev)
return cs
def get_paths(self, repo):
"""
recursive walk in root dir and return a set of all path in that dir
based on repository walk function
index_paths_ = set()
try:
cs = self._get_index_changeset(repo)
for _topnode, _dirs, files in cs.walk('/'):
for f in files:
index_paths_.add(jn(safe_str(repo.path), safe_str(f.path)))
except RepositoryError:
log.debug(traceback.format_exc())
pass
return index_paths_
def get_node(self, repo, path, index_rev=None):
gets a filenode based on given full path. It operates on string for
hg git compatibility.
:param repo: scm repo instance
:param path: full path including root location
:return: FileNode
# FIXME: paths should be normalized ... or even better: don't include repo.path
path = safe_str(path)
repo_path = safe_str(repo.path)
assert path.startswith(repo_path)
assert path[len(repo_path)] in (os.path.sep, os.path.altsep)
node_path = path[len(repo_path) + 1:]
cs = self._get_index_changeset(repo, index_rev=index_rev)
node = cs.get_node(node_path)
return node
def get_node_mtime(self, node):
return mktime(node.last_changeset.date.timetuple())
def add_doc(self, writer, path, repo, repo_name, index_rev=None):
Adding doc to writer this function itself fetches data from
the instance of vcs backend
node = self.get_node(repo, path, index_rev)
except (ChangesetError, NodeDoesNotExistError):
log.debug("couldn't add doc - %s did not have %r at %s", repo, path, index_rev)
return 0, 0
indexed = indexed_w_content = 0
# we just index the content of chosen files, and skip binary files
if node.extension in INDEX_EXTENSIONS and not node.is_binary:
u_content = node.content
if not isinstance(u_content, unicode):
log.warning(' >> %s Could not get this content as unicode '
'replacing with empty content' % path)
u_content = u''
log.debug(' >> %s [WITH CONTENT]', path)
indexed_w_content += 1
log.debug(' >> %s', path)
# just index file name without it's content
indexed += 1
p = safe_unicode(path)
writer.add_document(
fileid=p,
owner=unicode(repo.contact),
repository=safe_unicode(repo_name),
path=p,
content=u_content,
modtime=self.get_node_mtime(node),
extension=node.extension
)
return indexed, indexed_w_content
def index_changesets(self, writer, repo_name, repo, start_rev=None):
Add all changeset in the vcs repo starting at start_rev
to the index writer
:param writer: the whoosh index writer to add to
:param repo_name: name of the repository from whence the
changeset originates including the repository group
:param repo: the vcs repository instance to index changesets for,
the presumption is the repo has changesets to index
:param start_rev=None: the full sha id to start indexing from
if start_rev is None then index from the first changeset in
the repo
if start_rev is None:
start_rev = repo[0].raw_id
log.debug('indexing changesets in %s starting at rev: %s',
repo_name, start_rev)
indexed = 0
cs_iter = repo.get_changesets(start=start_rev)
total = len(cs_iter)
for cs in cs_iter:
log.debug(' >> %s/%s', cs, total)
raw_id=unicode(cs.raw_id),
date=cs._timestamp,
author=cs.author,
message=cs.message,
last=cs.last,
added=u' '.join([safe_unicode(node.path) for node in cs.added]).lower(),
removed=u' '.join([safe_unicode(node.path) for node in cs.removed]).lower(),
changed=u' '.join([safe_unicode(node.path) for node in cs.changed]).lower(),
parents=u' '.join([cs.raw_id for cs in cs.parents]),
log.debug('indexed %d changesets for repo %s', indexed, repo_name)
return indexed
def index_files(self, file_idx_writer, repo_name, repo):
Index files for given repo_name
:param file_idx_writer: the whoosh index writer to add to
:param repo_name: name of the repository we're indexing
:param repo: instance of vcs repo
i_cnt = iwc_cnt = 0
log.debug('building index for %s @revision:%s', repo.path,
self._get_index_revision(repo))
for idx_path in self.get_paths(repo):
i, iwc = self.add_doc(file_idx_writer, idx_path, repo, repo_name, index_rev)
i_cnt += i
iwc_cnt += iwc
log.debug('added %s files %s with content for repo %s',
i_cnt + iwc_cnt, iwc_cnt, repo.path)
return i_cnt, iwc_cnt
def update_changeset_index(self):
Status change: