# -*- coding: utf-8 -*-
import ast
from colorama import Fore, Style
from importlib.resources import files
import itertools
import multiprocessing
import pathlib
import re
import subprocess
from tqdm import tqdm
import nltk
from nltk.stem import PorterStemmer
from scilpy import SCILPY_HOME
# Total width, in characters, of the '='-padded titles built by _make_title.
SPACING_LEN = 80
# Single Porter stemmer instance shared by the stemming helpers of this module.
stemmer = PorterStemmer()
# Path to the JSON file containing script information and keywords
VOCAB_FILE_PATH = files("scilpy").joinpath("data/vocabulary.json")
# Object names offered to the user by prompt_user_for_object().
# Each name must appear only once, otherwise the interactive menu lists it
# twice under two different numbers.
OBJECTS = [
    'aodf', 'bids', 'bingham', 'btensor', 'bundle',
    'connectivity', 'denoising', 'dki', 'dti', 'dwi',
    'fodf', 'freewater', 'frf', 'gradients', 'header',
    'json', 'labels', 'lesions', 'mti', 'NODDI', 'sh',
    'surface', 'tracking', 'tractogram', 'viz', 'volume',
    'qball', 'rgb',
]
[docs]
def prompt_user_for_object():
    """
    Interactively ask the user to pick one entry of ``OBJECTS``.

    The available objects are printed as a 1-based numbered menu, then the
    user is asked again and again until a valid number is typed.

    Returns
    -------
    str
        The entry of ``OBJECTS`` matching the number typed by the user.
    """
    print("Available objects:")
    for number, name in enumerate(OBJECTS, start=1):
        print(f"{number}. {name}")

    while True:
        try:
            selected = int(
                input("Choose the object you want to work on "
                      "(enter the number): "))
        except ValueError:
            # Anything that is not an integer: ask again.
            print("Invalid input. Please enter a number.")
            continue

        if 1 <= selected <= len(OBJECTS):
            # The menu is 1-based, the list is 0-based.
            return OBJECTS[selected - 1]
        print(f"Please enter a number between 1 and {len(OBJECTS)}.")
def _make_title(text):
    """
    Build a colored title line.

    ``text`` is centered in a line of ``SPACING_LEN`` characters padded with
    '=' signs, and wrapped in colorama codes (bright light blue, then reset).

    Parameters
    ----------
    text : str
        Text of the title.

    Returns
    -------
    str
        The formatted title.
    """
    banner = text.center(SPACING_LEN, "=")
    return f'{Fore.LIGHTBLUE_EX}{Style.BRIGHT}{banner}{Style.RESET_ALL}'
def _get_docstring_from_script_path(script):
"""
Extract a python file's docstring from a filepath.
Parameters
----------
script : str
Path to python file
Returns
-------
docstring : str
The file's docstring, or an empty string if there was no docstring.
"""
with open(script, 'r') as reader:
file_contents = reader.read()
module = ast.parse(file_contents)
docstring = ast.get_docstring(module) or ''
return docstring
def _split_first_sentence(text):
"""
Split the first sentence from the rest of a string by finding the first
dot or newline. If there is no dot or newline, return the full string as
the first sentence, and None as the remaining text.
Parameters
----------
text : str
Text to parse.
Returns
-------
first_sentence : str
The first sentence, or the full text if no dot or newline was found.
remaining : str
Everything after the first sentence.
"""
candidates = ['. ', '.\n']
sentence_idx = -1
for candidate in candidates:
idx = text.find(candidate)
if idx != -1 and idx < sentence_idx or sentence_idx == -1:
sentence_idx = idx
split_idx = (sentence_idx + 1) or None
sentence = text[:split_idx]
remaining = text[split_idx:] if split_idx else ""
return sentence, remaining
def _stem_word(word):
    """
    Stem a single word with the module-level Porter stemmer.

    Words of three characters or fewer are returned unchanged.

    Parameters
    ----------
    word : str
        Word to stem.

    Returns
    -------
    str
        Stemmed word.
    """
    return word if len(word) <= 3 else stemmer.stem(word)
def _stem_keywords(keywords):
    """
    Apply ``_stem_word`` to every keyword of a list.

    Parameters
    ----------
    keywords : list of str
        Keywords to be stemmed.

    Returns
    -------
    list of str
        Stemmed keywords, in the same order as the input.
    """
    return list(map(_stem_word, keywords))
def _stem_text(text):
    """
    Tokenize a text with ``nltk.word_tokenize`` and stem every token.

    Parameters
    ----------
    text : str
        Text to be stemmed.

    Returns
    -------
    str
        The stemmed tokens, joined by single spaces.
    """
    tokens = nltk.word_tokenize(text)
    stemmed_tokens = map(_stem_word, tokens)
    return ' '.join(stemmed_tokens)
def _stem_phrase(phrase):
    """
    Split a phrase on whitespace and stem every resulting word.

    Parameters
    ----------
    phrase : str
        Phrase to be stemmed.

    Returns
    -------
    str
        The stemmed words, joined by single spaces.
    """
    stemmed_words = map(_stem_word, phrase.split())
    return ' '.join(stemmed_words)
def _generate_help_file(args):
"""
Generate help file for each script
"""
script = args[0]
hidden_dir = args[1]
help_file = hidden_dir / f'{script.name}.help'
# Check if help file already exists
if help_file.exists():
return
# Run the script with --h and capture the output
result = subprocess.run(['python', script, '--h'],
capture_output=True, text=True)
# Save the output to the hidden file
with open(help_file, 'w') as f:
f.write(result.stdout)
def _generate_help_files(nbr_cpu=1):
    """
    Precompute the help text of the scripts of the ``cli`` directory of the
    scilpy package.

    Each script (except ``__init__.py`` and ``scil_search_keywords.py``) is
    run with the '--h' flag and its output is saved as
    ``<script name>.help`` in the '.hidden' directory of ``SCILPY_HOME``.
    Precomputing the help outputs can be useful for faster searches, and
    storing them in a hidden directory avoids clutter in the scripts
    directory.

    If a help file already exists for a script, the script is skipped,
    and the existing help file is left unchanged.

    Parameters
    ----------
    nbr_cpu : int, optional
        Number of worker processes used to run the scripts.
    """
    scripts_dir = files("scilpy").joinpath("cli")

    # Hidden directory to store help files. parents=True because SCILPY_HOME
    # itself may not exist yet, in which case a plain mkdir() would fail.
    hidden_dir = pathlib.Path(SCILPY_HOME) / ".hidden"
    hidden_dir.mkdir(parents=True, exist_ok=True)

    excluded = {'__init__.py', 'scil_search_keywords.py'}
    scripts = [script for script in scripts_dir.glob('*.py')
               if script.name not in excluded]

    # Names of the help files already on disk (set: O(1) membership tests).
    existing_helps = {help_file.name
                      for help_file in hidden_dir.glob('*.help')}
    scripts_to_regenerate = [script for script in scripts
                             if f'{script.name}.help' not in existing_helps]

    # Check if all help files are present
    if not scripts_to_regenerate:
        print("All help files are already generated.")
        return

    jobs = zip(scripts_to_regenerate, itertools.repeat(hidden_dir))
    with multiprocessing.Pool(processes=nbr_cpu) as pool:
        # imap is lazy: consume it entirely so every job runs and the
        # progress bar advances.
        for _ in tqdm(pool.imap(_generate_help_file, jobs),
                      total=len(scripts_to_regenerate)):
            pass
def _highlight_keywords(text, all_expressions):
    """
    Wrap every occurrence of the given expressions in colorama codes.

    Expressions are processed one after the other, case-insensitively. A
    match may carry one extra leading word character, an optional 's' and
    one extra trailing word character, and must be delimited by word
    boundaries.

    Parameters
    ----------
    text : str
        Text to highlight keywords in.
    all_expressions : list of str
        List of all things to highlight.

    Returns
    -------
    str
        Text with highlighted keywords.
    """
    def _colorize(match):
        # Bright light yellow, then back to the default style.
        return (f'{Fore.LIGHTYELLOW_EX}{Style.BRIGHT}{match.group(0)}'
                f'{Style.RESET_ALL}')

    highlighted = text
    for expression in all_expressions:
        matcher = re.compile(
            r'\b(\w?' + re.escape(expression) + r's?\w?)\b', re.IGNORECASE)
        highlighted = matcher.sub(_colorize, highlighted)
    return highlighted
def _get_synonyms(keyword, synonyms_data):
    """
    Collect the synonyms of a keyword.

    A synonym set is selected when it contains the keyword, either as is or
    after stemming of both the keyword and the members of the set. The
    comparison is case-insensitive.

    Parameters
    ----------
    keyword : str
        Keyword to find synonyms for.
    synonyms_data : iterable of iterable of str
        Sets of words that are synonyms of each other.

    Returns
    -------
    list of str
        Lowercased members of every selected set, without duplicates and in
        no particular order.
    """
    wanted = keyword.lower()
    wanted_stem = _stem_word(wanted)

    collected = []
    for group in synonyms_data:
        members = [member.lower() for member in group]
        if wanted in members or any(_stem_word(member) == wanted_stem
                                    for member in members):
            collected.extend(members)
    return list(set(collected))
def _extract_keywords_and_phrases(expressions):
"""
Extract keywords and phrases from the provided list.
Parameters
----------
expressions : list of str
List of keywords and phrases.
Returns
-------
list of str, list of str
List of individual keywords and list of phrases.
"""
keywords_set = set()
phrases_set = set()
for expression in expressions:
# if keyword contain blank space (contains more that 1 word)
if ' ' in expression:
phrases_set.add(expression.lower())
else:
keywords_set.add(expression.lower())
return list(keywords_set), list(phrases_set)
def _calculate_score(keywords, phrases, text, filename, suffix=''):
    """
    Count how often keywords and phrases appear in a text and a filename.

    The text and the filename are lowercased and stemmed first. Keywords are
    searched in both of them; phrases are stemmed and searched in the text
    only.

    Parameters
    ----------
    keywords : list of str
        Keywords to search for.
    phrases : list of str
        Phrases to search for.
    text : str
        Text to search within.
    filename : str
        Filename to search within.
    suffix : str, optional
        String appended to every key of the returned dictionary.

    Returns
    -------
    dict
        Number of matches per expression. Keys are the lowercased keywords
        and the phrases as given, each followed by ``suffix``; expressions
        without any match are left out.
    """
    stemmed_text = _stem_text(text.lower())
    stemmed_filename = _stem_text(filename.lower())
    score_details = {}

    for raw_keyword in keywords:
        keyword = raw_keyword.lower()
        # Use regular expressions to match whole words only
        pattern = re.compile(
            r'\b(\w?' + re.escape(keyword) + r's?\w?)\b', re.IGNORECASE)
        candidates = (pattern.findall(stemmed_text)
                      + pattern.findall(stemmed_filename))

        if len(keyword) <= 3:
            # Short keywords must match exactly.
            hits = sum(candidate == keyword for candidate in candidates)
        else:
            # Longer keywords match when both words share the same stem.
            keyword_stem = _stem_word(keyword)
            hits = sum(_stem_word(candidate) == keyword_stem
                       for candidate in candidates)

        if hits > 0:
            score_details[keyword + suffix] = hits

    for phrase in phrases:
        occurrences = stemmed_text.count(_stem_text(phrase.lower()))
        if occurrences > 0:
            score_details[phrase + suffix] = occurrences

    return score_details
[docs]
def update_matches_and_scores(scores, filename, score_details):
    """
    Add the scores of one script to the overall scores.

    Parameters
    ----------
    scores : dict
        Scores per filename, each stored as a dictionary of scores per
        keyword or phrase. Updated in place.
    filename : str
        The name of the script file being analyzed.
    score_details : dict
        A dictionary containing the scores for the keywords
        and phrases found in the script. Null scores are ignored.

    Returns
    -------
    dict
        The same ``scores`` dictionary that was passed in, after the update.
    """
    for expression, count in score_details.items():
        if count == 0:
            continue

        # The entry of the file is only created once a non-null score is met.
        file_scores = scores.setdefault(filename, {})
        if expression in file_scores:
            file_scores[expression] += count
        else:
            file_scores[expression] = count
    return scores