# Unified diff for rhodecode/lib/indexers/daemon.py.
# The line breaks and indentation of this copy have been collapsed, so the
# hunks below cannot be applied as-is; the text is kept verbatim.
#
# What the patch changes, per its +/- lines:
#   * Imports: INDEX_EXTENSIONS is now imported from rhodecode.config.conf and
#     safe_unicode from rhodecode.lib.utils2; SCHEMA and IDX_NAME still come
#     from rhodecode.lib.indexers.
#   * Logging: the module-level StreamHandler/Formatter setup (level,
#     propagate flag, handler) is removed and the logger name changes from
#     'whooshIndexer' to 'whoosh_indexer'.
#   * add_doc: initialises two counters (indexed, indexed_w_content) and
#     returns them as a pair; the add_document call is only re-wrapped.
#   * build_index / update_index: unpack the pair returned by add_doc,
#     accumulate it, and log an 'added %s files %s with content' summary;
#     both also log INDEX_EXTENSIONS when they start.
#
# NOTE(review): in add_doc the branch that warns 'Could not get this content
#   as unicode' sets u_content = u'' with no visible counter increment. Whether
#   such files are counted depends on the nesting of `indexed_w_content += 1`,
#   which cannot be recovered from this collapsed copy; confirm in the file.
# NOTE(review): the warning call changes from passing `path` as a logging
#   argument to eager `%` formatting of the message.
# NOTE(review): update_index initialises ri_cnt/riwc_cnt once before the loop
#   over repositories, yet the summary message formats a single repo.path. If
#   that log call sits after the loop, `repo` is unbound when self.repo_paths
#   is empty; placement cannot be determined here, so verify.
# NOTE(review): the log literals 'BUILDIN' and 'COMMITING' look misspelled;
#   they are runtime strings and are left untouched.
diff --git a/rhodecode/lib/indexers/daemon.py b/rhodecode/lib/indexers/daemon.py --- a/rhodecode/lib/indexers/daemon.py +++ b/rhodecode/lib/indexers/daemon.py @@ -38,34 +38,17 @@ from os.path import join as jn project_path = dn(dn(dn(dn(os.path.realpath(__file__))))) sys.path.append(project_path) - +from rhodecode.config.conf import INDEX_EXTENSIONS from rhodecode.model.scm import ScmModel -from rhodecode.lib import safe_unicode -from rhodecode.lib.indexers import INDEX_EXTENSIONS, SCHEMA, IDX_NAME +from rhodecode.lib.utils2 import safe_unicode +from rhodecode.lib.indexers import SCHEMA, IDX_NAME from rhodecode.lib.vcs.exceptions import ChangesetError, RepositoryError, \ NodeDoesNotExistError from whoosh.index import create_in, open_dir - -log = logging.getLogger('whooshIndexer') -# create logger -log.setLevel(logging.DEBUG) -log.propagate = False -# create console handler and set level to debug -ch = logging.StreamHandler() -ch.setLevel(logging.DEBUG) - -# create formatter -formatter = logging.Formatter("%(asctime)s - %(name)s -" - " %(levelname)s - %(message)s") - -# add formatter to ch -ch.setFormatter(formatter) - -# add ch to logger -log.addHandler(ch) +log = logging.getLogger('whoosh_indexer') class WhooshIndexingDaemon(object): @@ -103,7 +86,8 @@ class WhooshIndexingDaemon(object): self.initial = True def get_paths(self, repo): - """recursive walk in root dir and return a set of all path in that dir + """ + recursive walk in root dir and return a set of all path in that dir based on repository walk function """ index_paths_ = set() @@ -127,32 +111,39 @@ class WhooshIndexingDaemon(object): return mktime(node.last_changeset.date.timetuple()) def add_doc(self, writer, path, repo, repo_name): - """Adding doc to writer this function itself fetches data from - the instance of vcs backend""" - node = self.get_node(repo, path) + """ + Adding doc to writer this function itself fetches data from + the instance of vcs backend + """ - #we just index the content of 
chosen files, and skip binary files + node = self.get_node(repo, path) + indexed = indexed_w_content = 0 + # we just index the content of chosen files, and skip binary files if node.extension in INDEX_EXTENSIONS and not node.is_binary: - u_content = node.content if not isinstance(u_content, unicode): log.warning(' >> %s Could not get this content as unicode ' - 'replacing with empty content', path) + 'replacing with empty content' % path) u_content = u'' else: log.debug(' >> %s [WITH CONTENT]' % path) + indexed_w_content += 1 else: log.debug(' >> %s' % path) - #just index file name without it's content + # just index file name without it's content u_content = u'' + indexed += 1 - writer.add_document(owner=unicode(repo.contact), - repository=safe_unicode(repo_name), - path=safe_unicode(path), - content=u_content, - modtime=self.get_node_mtime(node), - extension=node.extension) + writer.add_document( + owner=unicode(repo.contact), + repository=safe_unicode(repo_name), + path=safe_unicode(path), + content=u_content, + modtime=self.get_node_mtime(node), + extension=node.extension + ) + return indexed, indexed_w_content def build_index(self): if os.path.exists(self.index_location): @@ -164,19 +155,25 @@ class WhooshIndexingDaemon(object): idx = create_in(self.index_location, SCHEMA, indexname=IDX_NAME) writer = idx.writer() - + log.debug('BUILDIN INDEX FOR EXTENSIONS %s' % INDEX_EXTENSIONS) for repo_name, repo in self.repo_paths.items(): log.debug('building index @ %s' % repo.path) - + i_cnt = iwc_cnt = 0 for idx_path in self.get_paths(repo): - self.add_doc(writer, idx_path, repo, repo_name) + i, iwc = self.add_doc(writer, idx_path, repo, repo_name) + i_cnt += i + iwc_cnt += iwc + log.debug('added %s files %s with content for repo %s' % ( + i_cnt + iwc_cnt, iwc_cnt, repo.path) + ) log.debug('>> COMMITING CHANGES <<') writer.commit(merge=True) log.debug('>>> FINISHED BUILDING INDEX <<<') def update_index(self): - log.debug('STARTING INCREMENTAL INDEXING UPDATE') + 
log.debug('STARTING INCREMENTAL INDEXING UPDATE FOR EXTENSIONS %s' % + INDEX_EXTENSIONS) idx = open_dir(self.index_location, indexname=self.indexname) # The set of all paths in the index @@ -215,14 +212,19 @@ class WhooshIndexingDaemon(object): # Loop over the files in the filesystem # Assume we have a function that gathers the filenames of the # documents to be indexed + ri_cnt = riwc_cnt = 0 for repo_name, repo in self.repo_paths.items(): for path in self.get_paths(repo): if path in to_index or path not in indexed_paths: # This is either a file that's changed, or a new file # that wasn't indexed before. So index it! - self.add_doc(writer, path, repo, repo_name) + i, iwc = self.add_doc(writer, path, repo, repo_name) log.debug('re indexing %s' % path) - + ri_cnt += i + riwc_cnt += iwc + log.debug('added %s files %s with content for repo %s' % ( + ri_cnt + riwc_cnt, riwc_cnt, repo.path) + ) log.debug('>> COMMITING CHANGES <<') writer.commit(merge=True) log.debug('>>> FINISHED REBUILDING INDEX <<<')