Skip to content

Miscellaneous

pangadfs.misc

calculate_jaccard_diversity(lineup1, lineup2)

Calculate Jaccard diversity between two lineups

Parameters:

Name Type Description Default
lineup1

First lineup (array-like of player IDs)

required
lineup2

Second lineup (array-like of player IDs)

required

Returns:

Name Type Description
float

Jaccard diversity (1 - Jaccard similarity)

Examples:

>>> lineup1 = [1, 2, 3, 4, 5]
>>> lineup2 = [1, 2, 6, 7, 8]
>>> diversity = calculate_jaccard_diversity(lineup1, lineup2)
>>> print(f"Diversity: {diversity:.3f}")
Diversity: 0.750
Source code in pangadfs/misc.py
def calculate_jaccard_diversity(lineup1, lineup2):
    """Calculate Jaccard diversity between two lineups

    Duplicate IDs inside a lineup are ignored because each lineup is
    converted to a set before comparison.

    Args:
        lineup1: First lineup (array-like of player IDs)
        lineup2: Second lineup (array-like of player IDs)

    Returns:
        float: Jaccard diversity (1 - Jaccard similarity). When both
            lineups are empty the similarity is taken to be 0.0, so the
            returned diversity is 1.0.

    Examples:
        >>> lineup1 = [1, 2, 3, 4, 5]
        >>> lineup2 = [1, 2, 6, 7, 8]
        >>> diversity = calculate_jaccard_diversity(lineup1, lineup2)
        >>> print(f"Diversity: {diversity:.3f}")
        Diversity: 0.750
    """
    set1 = set(lineup1)
    set2 = set(lineup2)
    shared = len(set1 & set2)
    combined = len(set1 | set2)
    # Guard the division: an empty union means both lineups were empty.
    similarity = shared / combined if combined > 0 else 0.0
    return 1.0 - similarity  # diversity = 1 - similarity

diversity(population)

Calculates diversity of lineups

Parameters:

Name Type Description Default
population ndarray

the population

required

Returns:

Type Description
ndarray

np.ndarray: is square, shape len(population) x len(population)

Source code in pangadfs/misc.py
def diversity(population: np.ndarray) -> np.ndarray:
    """Build the pairwise overlap matrix for a population of lineups

    Each lineup is turned into a vector counting how often every distinct
    ID of the population occurs in it; entry (i, j) of the result is the
    dot product of the count vectors of lineups i and j.

    Args:
        population (np.ndarray): the population

    Returns:
        np.ndarray: is square, shape len(population) x len(population)

    """
    distinct_ids = np.unique(population)
    # Compare every slot against every distinct ID, then collapse axis 1
    # to obtain per-lineup occurrence counts.
    matches = population[..., np.newaxis] == distinct_ids
    counts = matches.sum(axis=1)
    return np.einsum('ip,jp->ij', counts, counts)

diversity_optimized(population)

Calculates pairwise diversity between samples (overlap of player IDs).

Parameters:

Name Type Description Default
population ndarray

shape (N, K), where each row is a lineup

required

Returns:

Type Description
ndarray

np.ndarray: shape (N, N), matrix of pairwise overlap scores

Source code in pangadfs/misc.py
def diversity_optimized(population: np.ndarray) -> np.ndarray:
    """
    Calculates pairwise diversity between samples (overlap of player IDs).

    Args:
        population (np.ndarray): shape (N, K), where each row is a lineup

    Returns:
        np.ndarray: shape (N, N), matrix of pairwise overlap scores. The
            dtype is the smallest unsigned integer type able to hold K * K
            (uint8 for K <= 15), so scores cannot wrap around.
    """
    uniques, inverse = np.unique(population, return_inverse=True)
    # NumPy >= 2.0 returns `inverse` with the shape of the input when
    # axis is None, while older releases return it flat. Flatten it so it
    # always lines up element-for-element with `rows` below.
    inverse = inverse.ravel()
    N, K = population.shape
    U = len(uniques)

    # A single count is at most K and a dot product of two count vectors
    # is at most K * K, so size the dtype from that bound instead of
    # hard-coding uint8 (which silently wraps at 256).
    count_dtype = np.min_scalar_type(K * K)

    # Construct count matrix a: shape (N, U)
    a = np.zeros((N, U), dtype=count_dtype)
    rows = np.repeat(np.arange(N), K)
    # np.add.at accumulates repeated (row, col) pairs, unlike a[rows, cols] += 1
    np.add.at(a, (rows, inverse), 1)

    # Pairwise dot product: overlap between lineups
    return a @ a.T

exposure(population=None)

Returns dict of index: count of individuals

Parameters:

Name Type Description Default
population ndarray

the population

None

Returns:

Type Description
Dict[int, int]

Dict[int, int]: key is index, value is count of lineup

Examples:

>>> fittest_population = population[np.where(fitness > np.percentile(fitness, 97))]
>>> exposure = exposure(fittest_population)
>>> top_exposure = np.argpartition(np.array(list(exposure.values())), -10)[-10:]
>>> print([round(i, 3) for i in sorted(top_exposure / len(fittest_population), reverse=True)])
Source code in pangadfs/misc.py
def exposure(population: np.ndarray = None) -> Dict[int, int]:
    """Returns dict of index: count of individuals

    Args:
        population (np.ndarray): the population. Values must be
            non-negative integers because the counting is done with
            np.bincount.

    Returns:
        Dict[int, int]: key is index, value is the number of times that
            index occurs anywhere in the population. Keys appear in order
            of first occurrence in the flattened population.

    Raises:
        ValueError: if population is None

    Examples:
        >>> fittest_population = population[np.where(fitness > np.percentile(fitness, 97))]
        >>> counts = exposure(fittest_population)
        >>> top_exposure = np.argpartition(np.array(list(counts.values())), -10)[-10:]
        >>> print([round(i, 3) for i in sorted(top_exposure / len(fittest_population), reverse=True)])

    """
    if population is None:
        raise ValueError('population is required')
    # flatten must be *called*; the bare attribute is a bound method, which
    # can be neither zipped nor passed to np.bincount.
    flat = population.flatten()
    # bincount(flat)[flat] looks up, for every element, its total count
    return dict(zip(flat, np.bincount(flat)[flat]))

multidimensional_shifting(elements, num_samples, sample_size, probs)

Based on https://medium.com/ibm-watson/incredibly-fast-random-sampling-in-python-baf154bd836a

Parameters:

Name Type Description Default
elements iterable

iterable to sample from, typically a dataframe index

required
num_samples int

the number of rows (e.g. initial population size)

required
sample_size int

the number of columns (e.g. team size)

required
probs iterable

is same size as elements

required

Returns:

Name Type Description
ndarray ndarray

of shape (num_samples, sample_size)

Source code in pangadfs/misc.py
def multidimensional_shifting(elements: Iterable, 
                              num_samples: int, 
                              sample_size: int, 
                              probs: Iterable) -> np.ndarray:
    """Based on https://medium.com/ibm-watson/incredibly-fast-random-sampling-in-python-baf154bd836a

    Draws num_samples rows of sample_size distinct elements each. Every row
    gets its own normalized random vector; the probabilities are subtracted
    from it and the sample_size smallest entries of the row are selected.

    Args:
        elements (iterable): iterable to sample from, typically a dataframe index.
            Objects exposing to_numpy() are converted with it; anything else
            goes through np.asarray.
        num_samples (int): the number of rows (e.g. initial population size)
        sample_size (int): the number of columns (e.g. team size)
        probs (iterable): is same size as elements

    Returns:
        ndarray: of shape (num_samples, sample_size)

    Raises:
        ValueError: if sample_size is negative or larger than len(probs)

    """
    probabilities = np.asarray(probs)
    n_elements = probabilities.shape[0]
    if not 0 <= sample_size <= n_elements:
        raise ValueError(
            f'sample_size must be between 0 and {n_elements}, got {sample_size}'
        )

    random_shifts = np.random.random((num_samples, n_elements))
    random_shifts /= random_shifts.sum(axis=1)[:, np.newaxis]
    # Broadcasting subtracts the probability vector from every row, so the
    # probabilities do not need to be replicated with np.tile first.
    shifted_probabilities = random_shifts - probabilities
    # kth = sample_size - 1 is enough to put the sample_size smallest values
    # first, and (unlike kth = sample_size) stays in bounds when every
    # element is requested.
    samples = np.argpartition(shifted_probabilities, sample_size - 1, axis=1)[:, :sample_size]

    if hasattr(elements, 'to_numpy'):
        pool = elements.to_numpy()
    else:
        pool = np.asarray(elements)
    return pool[samples]

multidimensional_shifting_fast(num_samples, sample_size, probs, elements=None)

High-performance probabilistic sampling using random shifting.

Parameters:

Name Type Description Default
num_samples int

Number of sample rows to generate.

required
sample_size int

Number of items to select per row.

required
probs ndarray

Probability vector of shape (n_elements,), dtype float32 recommended.

required
elements ndarray

Optional array of element IDs (defaults to np.arange(len(probs))).

None

Returns:

Type Description
ndarray

np.ndarray of shape (num_samples, sample_size)

Source code in pangadfs/misc.py
def multidimensional_shifting_fast(
    num_samples: int,
    sample_size: int,
    probs: np.ndarray,
    elements: np.ndarray = None
) -> np.ndarray:
    """
    Probabilistic sampling via random shifting, computed in float32.

    For every row a random vector is drawn and normalized to sum to one,
    the probabilities are subtracted from it, and the sample_size smallest
    entries of the row are picked.

    Args:
        num_samples: Number of sample rows to generate.
        sample_size: Number of items to select per row.
        probs: Probability vector of shape (n_elements,); it is converted to float32.
        elements: Optional array of element IDs (defaults to np.arange(len(probs))).

    Returns:
        np.ndarray of shape (num_samples, sample_size)
    """
    pool = np.arange(len(probs)) if elements is None else np.asarray(elements)

    weights = np.asarray(probs, dtype=np.float32)
    noise = np.random.random((num_samples, len(weights))).astype(np.float32)
    # Normalize each row so the noise is on the same scale as the weights.
    noise /= noise.sum(axis=1, keepdims=True)

    # After partitioning at sample_size - 1, the first sample_size columns
    # hold the indices of the smallest shifted values of each row.
    partitioned = np.argpartition(noise - weights, sample_size - 1, axis=1)
    chosen = partitioned[:, :sample_size]

    return pool[chosen]

multidimensional_shifting_numba(num_samples, sample_size, probs, elements=None)

Numba-accelerated version of multidimensional shifting. Fast for large numbers of samples and small element sets.

Parameters:

Name Type Description Default
num_samples int

Number of rows to sample.

required
sample_size int

Number of items per sample.

required
probs ndarray

Probability vector of shape (n_elements,).

required
elements ndarray

IDs to sample from. Defaults to np.arange(len(probs)).

None

Returns:

Type Description
ndarray

np.ndarray: shape (num_samples, sample_size)

Source code in pangadfs/misc.py
def multidimensional_shifting_numba(
    num_samples: int,
    sample_size: int,
    probs: np.ndarray,
    elements: np.ndarray = None
) -> np.ndarray:
    """
    Numba-accelerated version of multidimensional shifting.
    Fast for large numbers of samples and small element sets.

    The index selection itself is delegated to _generate_shifted_indices;
    this wrapper only normalizes the inputs and maps the returned indices
    onto the element IDs.

    Args:
        num_samples (int): Number of rows to sample.
        sample_size (int): Number of items per sample.
        probs (np.ndarray): Probability vector of shape (n_elements,); converted to float32.
        elements (np.ndarray, optional): IDs to sample from. Defaults to np.arange(len(probs)).

    Returns:
        np.ndarray: shape (num_samples, sample_size)
    """
    pool = np.arange(len(probs)) if elements is None else np.asarray(elements)
    weights = np.asarray(probs, dtype=np.float32)
    picked = _generate_shifted_indices(weights, num_samples, sample_size)
    return pool[picked]

parents(population)

Evenly splits population

Parameters:

Name Type Description Default
population ndarray

the population to crossover. Shape is n_individuals x n_chromosomes.

required

Returns:

Type Description
Tuple[ndarray, ndarray]

Tuple[np.ndarray, np.ndarray]: population split into two equal-size arrays

Source code in pangadfs/misc.py
def parents(population: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    """Split the population into two halves of equal length

    The population is cut in two along its first axis. If the halves end
    up with different lengths, both are truncated to the shorter one.

    Args:
        population (np.ndarray): the population to crossover. Shape is n_individuals x n_chromosomes.

    Returns:
        Tuple[np.ndarray, np.ndarray]: population split into two equal-size arrays

    """
    halves = np.array_split(population, 2)
    first_half, second_half = halves
    # Use the shorter half's length so both parents pair up one-to-one.
    n_pairs = len(second_half) if len(second_half) < len(first_half) else len(first_half)
    return first_half[:n_pairs], second_half[:n_pairs]