Source code for codemetrics.vega

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import json
import os
import typing

import pandas as pd

from . import internals


[docs]def build_hierarchy(
    data: pd.DataFrame,
    get_parent=os.path.dirname,
    root: str = "",
    max_iter: int = 100,
    col_name: typing.Optional[str] = None,
) -> pd.DataFrame:
    """Build a hierarchy from a data set and a get_parent relationship.

    The output frame adds 2 columns in front: id and parent. Both are numerical
    where the parent id identifies the id of the parent as returned by the
    get_parent function.

    The id of the root element is set to 0 and the parent is set to np.nan.

    Args:
        data: data containing the leaves of the tree.
        get_parent: function returning the parent of an element.
        root: expected root of the hierarchy.
        max_iter: maximum number of iterations.
        col_name: name of the column to use as input (default to column 0).

    Returns:
        pandas.DataFrame with the columns id, parent and col_name.
        The parent value identifies the id of the parent in the hierarchy where
        the id 0 is the root. The columns other than col_name are discarded.

    """
    assert data.ndim == 2, "DataFrame-like object expected"
    if not col_name:
        col_name = data.columns[0]
    parent = get_parent.__name__
    df = data[[col_name]]
    frames = []
    seen = {root}
    root_actually_seen = False
    count = 0
    for _ in range(max_iter):
        df.loc[:, parent] = df[col_name].apply(get_parent)
        if root in df[parent].values:
            root_actually_seen = True
        df["id"] = range(count, count + len(df))
        count += len(df)
        frames.append(df)
        df = (
            df.loc[~df[parent].isin(seen), [parent]]
            .drop_duplicates()
            .rename(columns={parent: col_name})
        )
        seen.update(df[col_name])
        if len(df) == 0:
            frames.append(
                pd.DataFrame(data={"id": [count], col_name: [root], parent: [None]})
            )
            break
    if not root_actually_seen:
        msg = f"cannot find root {root} in input frame"
        internals.log.error(msg)
        raise ValueError(msg)
    df = pd.concat(frames, sort=False).drop_duplicates()
    df["id"] = len(df) - df["id"] - 1
    assert col_name is not None
    y_name = col_name + "_y"
    merged = pd.merge(df, df, left_on=col_name, right_on=parent, how="right")[
        [y_name, "id_y", "id_x"]
    ].rename(columns={y_name: col_name, "id_y": "id", "id_x": "parent"})
    return (
        merged[["id", "parent", col_name]].sort_values(by="id").reset_index(drop=True)
    )


def _vis_generic(
    df: pd.DataFrame,
    size_column: str,
    color_column: str,
    colorscheme: str,
    height: int = 300,
    width: int = 400,
) -> dict:
    """Factors common parts of vis_xxx functions.

    Internal. See vis_hot_spots or vis_ages for documentation.

    """
    if len(df) <= 0:
        raise ValueError("dataframe is empty")
    if size_column not in df.columns:
        raise ValueError(f"{size_column} not found in columns")
    if color_column not in df.columns:
        raise ValueError(f"{color_column} not found in columns")
    hierarchy = build_hierarchy(df[["path"]], root="")
    hierarchy = (
        pd.merge(hierarchy, df, left_on="path", right_on="path", how="left")
        .rename(columns={size_column: "size", color_column: "intensity"})
        .sort_values(by="id")
    )
    hierarchy.loc[:, ["size", "intensity"]] = hierarchy[["size", "intensity"]].fillna(0)
    json_values = hierarchy.to_json(orient="records")
    signal = (
        "datum.path + "
        f"(datum.intensity ? ', ' + datum.intensity + ' {color_column}' : '') + "
        f"(datum.size ? ', ' + datum.size + ' {size_column}' : '')"
    )
    desc: typing.Dict[str, typing.Any] = {
        "$schema": "https://vega.github.io/schema/vega/v4.json",
        "width": width,
        "height": height,
        "padding": 5,
        "autosize": "none",
        "data": [
            {
                "name": "tree",
                # 'values':  ...,
                "transform": [
                    {"type": "stratify", "key": "id", "parentKey": "parent"},
                    {
                        "type": "pack",
                        "field": "size",
                        "sort": {"field": "value", "order": "descending"},
                        "size": [{"signal": "width"}, {"signal": "height"}],
                    },
                ],
            }
        ],
        "scales": [
            {
                "name": "color",
                "type": "linear",
                "domain": {"data": "tree", "field": "intensity"},
                "range": {"scheme": colorscheme},
                "domainMin": 0,
            }
        ],
        "marks": [
            {
                "type": "symbol",
                "from": {"data": "tree"},
                "encode": {
                    "enter": {
                        "shape": {"value": "circle"},
                        "fill": {"scale": "color", "field": "intensity"},
                        "tooltip": {"signal": signal},
                    },
                    "update": {
                        "x": {"field": "x"},
                        "y": {"field": "y"},
                        "size": {"signal": "4 * datum.r * datum.r"},
                        "stroke": {"value": "white"},
                        "strokeWidth": {"value": 0.5},
                    },
                    "hover": {
                        "stroke": {"value": "black"},
                        "strokeWidth": {"value": 2},
                    },
                },
            }
        ],
    }
    desc["data"][0]["values"] = json.loads(json_values)
    return desc


[docs]def vis_hot_spots(
    df: pd.DataFrame,
    height: int = 300,
    width: int = 400,
    size_column: str = "lines",
    color_column: str = "changes",
    colorscheme: str = "yelloworangered",
) -> dict:
    """Convert get_hot_spots output to a json vega dict.

    Args:
        df: input data returned by :func:`codemetrics.get_hot_spots`
        height: vertical size of the figure.
        width: horizontal size of the figure.
        size_column: column that drives the size of the circles.
        color_column: column that drives the color intensity of the circles.
        colorscheme: color scheme. See https://vega.github.io/vega/docs/schemes/

    Returns:
        Vega description suitable to be use with Altair.

    Example::

        import codemetrics as cm
        from altair.vega.v4 import Vega
        hspots = cm.get_hot_spots(loc_df, log_df)
        desc = cm.vega.vis_hot_spots(hspots)
        Vega(desc)  # display the visualization inline in you notebook.

    See also:
        `Vega circle pack example`_

    .. _Vega circle pack example: https://vega.github.io/editor/#/examples/vega/circle-packing

    """
    return _vis_generic(
        df,
        size_column=size_column,
        color_column=color_column,
        colorscheme=colorscheme,
        width=width,
        height=height,
    )


[docs]def vis_ages(
    df: pd.DataFrame,
    height: int = 300,
    width: int = 400,
    colorscheme: str = "greenblue",
) -> dict:
    """Convert get_ages output to a json vega dict.

    Args:
        df: input data returned by :func:`codemetrics.get_ages`
        height: vertical size of the figure.
        width: horizontal size of the figure.
        colorscheme: color scheme. See https://vega.github.io/vega/docs/schemes/

    Returns:
        Vega description suitable to be use with Altair.

    Example::

        import codemetrics as cm
        from altair.vega.v4 import Vega
        ages = cm.get_ages(loc_df, log_df)
        desc = cm.vega.vis_ages(ages)
        Vega(desc)  # display the visualization inline in you notebook.

    See also:
        `Vega circle pack example`_

    .. _Vega circle pack example: https://vega.github.io/editor/#/examples/vega/circle-packing

    """
    df["days"] = df["age"].astype("int32")
    df = df.rename(columns={"code": "loc"})
    return _vis_generic(
        df,
        size_column="loc",
        color_column="days",
        colorscheme=colorscheme,
        width=width,
        height=height,
    )