Source code for scilpy.stats.stats

# -*- coding: utf-8 -*-

import logging

from itertools import combinations

import scipy.stats


[docs] def verify_normality(data, alpha=0.05): """ Parameters ---------- data: array_like Array of sample data to test normality on. Should be of 1 dimension. alpha: float Type 1 error of the normality test. Probability of false positive or rejecting null hypothesis when it is true. Returns ------- normality: bool Whether or not the sample can be considered normal p_value: float Probability to obtain an effect at least as extreme as the one in the current sample, assuming the null hypothesis. We reject the null hypothesis when this value is lower than alpha. """ normality = True # First, we verify if sample pass Shapiro-Wilk test W, p_value = scipy.stats.shapiro(data) if p_value < alpha and len(data) < 30: logging.info('The data sample can not be considered normal') normality = False else: logging.info('The data sample pass the normality assumption.') normality = True return normality, p_value
[docs] def verify_homoscedasticity(data_by_group, normality=False, alpha=0.05): """ Parameters ---------- data_by_group: list of array_like The sample data separated by groups. Possibly of different group size. normality: bool Whether or not the sample data of each groups can be considered normal alpha: float Type 1 error of the equality of variance test Probability of false positive or rejecting null hypothesis when it is true. Returns ------- test: string Name of the test done to verify homoscedasticity homoscedasticity: bool Whether or not the equality of variance across groups can be assumed p_value: float Probability to obtain an effect at least as extreme as the one in the current sample, assuming the null hypothesis. We reject the null hypothesis when this value is lower than alpha. """ total_nb = 0 for group in data_by_group: nb_current_group = len(group) total_nb += nb_current_group mean_nb = total_nb / len(data_by_group) if normality: test = 'Bartlett' W, p_value = scipy.stats.bartlett(*data_by_group) else: test = 'Levene' W, p_value = scipy.stats.levene(*data_by_group) logging.info('Test name: {}'.format(test)) if p_value < alpha and mean_nb < 30: logging.info('The sample didnt pass the equal variance assumption') homoscedasticity = False else: logging.info('The sample pass the equal variance assumption') homoscedasticity = True return test, homoscedasticity, p_value
[docs] def verify_group_difference(data_by_group, normality=False, homoscedasticity=False, alpha=0.05): """ Parameters ---------- data_by_group: list of array_like The sample data separated by groups. Possibly of different group size. normality: bool Whether or not the sample data of each groups can be considered normal. homoscedasticity: bool Whether or not the equality of variance across groups can be assumed. alpha: float Type 1 error of the equality of variance test. Probability of false positive or rejecting null hypothesis when it is true. Returns ------- test: string Name of the test done to verify group difference. difference: bool Whether or not the variable associated for groups has an effect on the current measurement. p_value: float Probability to obtain an effect at least as extreme as the one in the current sample, assuming the null hypothesis. We reject the null hypothesis when this value is lower than alpha. """ if len(data_by_group) == 2: if normality and homoscedasticity: test = 'Student' T, p_value = scipy.stats.ttest_ind(data_by_group[0], data_by_group[1]) elif normality: test = 'Welch' T, p_value = scipy.stats.ttest_ind(data_by_group[0], data_by_group[1], equal_var=False) else: test = 'Mannwhitneyu' T, p_value = scipy.stats.mannwhitneyu(data_by_group[0], data_by_group[1]) # We are doing a 2 tail test p_value = 2 * p_value size_too_small = [len(data_by_group[0]) < 15, len(data_by_group[1]) < 15] if not all(size_too_small): logging.warning('The power of the mann and withney u test' ' might be low due to small sample size') elif len(data_by_group) > 2: if normality and homoscedasticity: test = 'ANOVA' T, p_value = scipy.stats.f_oneway(*data_by_group) else: test = 'Kruskalwallis' T, p_value = scipy.stats.kruskal(*data_by_group) logging.info('Test name: {}'.format(test)) if p_value < alpha: logging.info('There is a difference between groups') difference = True else: logging.info('We are not able to detect difference between' ' the groups.') difference = False return test, difference, p_value
[docs] def verify_post_hoc(data_by_group, groups_list, test, correction=True, alpha=0.05): """ Parameters ---------- data_by_group: list of array_like The sample data separated by groups. Possibly of different lengths group size. groups_list: list of string The names of each group in the same order as data_by_group. test: string The name of the post-hoc analysis test to do. Post-hoc analysis is the analysis of pairwise difference a posteriori of the fact that there is a difference across groups. correction: bool Whether or not to do a Bonferroni correction on the alpha threshold. Used to have a more stable type 1 error across multiple comparison. alpha: float Type 1 error of the equality of variance test. Probability of false positive or rejecting null hypothesis when it is true. Returns ------- differences: list of (string, string, bool) The result of the post-hoc for every groups pairwise combinations. - 1st, 2nd dimension: Names of the groups chosen. - 3rd: Whether or not we detect a pairwise difference on the current measurement. - 4th: P-value of the pairwise difference test. test: string Name of the test done to verify group difference """ logging.info('We need to do a post-hoc analysis since ' 'there is a difference') logging.info('Post-hoc: {} pairwise'.format(test)) differences = [] nb_group = len(groups_list) # Bonferroni correction if correction: nb_combinaison = (nb_group * (nb_group - 1)) / 2 alpha = alpha / nb_combinaison for x, y in combinations(range(nb_group), 2): if test == 'Student': T, p_value = scipy.stats.ttest_ind( data_by_group[x], data_by_group[y]) elif test == 'Mannwhitneyu': T, p_value = scipy.stats.mannwhitneyu( data_by_group[x], data_by_group[y]) elif test == 'Wilcoxon': T, p_value = scipy.stats.wilcoxon( data_by_group[x], data_by_group[y]) differences.append((groups_list[x], groups_list[y], p_value < alpha, p_value)) logging.info('Result:') logging.info(differences) return test, differences