Skip to content

add_hash_columns

Public callable

Add business key and row-level SHA256 hash columns.

Parameters:

- df (Any, required): Input pandas or Spark DataFrame.
- business_keys (list[str] | None, default None): Business key columns used to build _business_key_hash.
- row_hash_columns (list[str] | None, default None): Columns used to build _row_hash. When omitted, all non-technical columns are used.
- include_business_key_hash (bool, default True): Whether to add _business_key_hash.
- include_row_hash (bool, default True): Whether to add _row_hash.
- engine (str, default "auto"): Execution engine (auto, pandas, or spark).

Returns:

- Any: DataFrame with hash columns added.

Raises:

- ValueError: If business key hashing is enabled without business_keys, or if required columns are missing.

Examples:

>>> import pandas as pd
>>> df = pd.DataFrame({"BUSINESS_KEY": ["A1"], "amount": [10.5]})
>>> out = add_hash_columns(df, business_keys=["BUSINESS_KEY"], engine="pandas")
>>> "_row_hash" in out.columns
True
Source code in src/fabricops_kit/technical_columns.py (lines 350-422):
def add_hash_columns(
    df,
    *,
    business_keys: list[str] | None = None,
    row_hash_columns: list[str] | None = None,
    include_business_key_hash: bool = True,
    include_row_hash: bool = True,
    engine: str = "auto",
):
    """Append SHA256-based technical hash columns to a DataFrame.

    Two columns may be added: ``_business_key_hash`` (built from
    ``business_keys``) and ``_row_hash`` (built from ``row_hash_columns``,
    or from every non-technical column when that argument is omitted).
    Works for both pandas and Spark DataFrames; the engine is detected
    automatically unless forced via ``engine``.

    Parameters
    ----------
    df : Any
        Input pandas or Spark DataFrame.
    business_keys : list[str] | None, optional
        Business key columns used to build ``_business_key_hash``.
    row_hash_columns : list[str] | None, optional
        Columns used to build ``_row_hash``. When omitted, all non-technical columns are used.
    include_business_key_hash : bool, default=True
        Whether to add ``_business_key_hash``.
    include_row_hash : bool, default=True
        Whether to add ``_row_hash``.
    engine : str, default="auto"
        Execution engine (``auto``, ``pandas``, or ``spark``).

    Returns
    -------
    Any
        DataFrame with hash columns added.

    Raises
    ------
    ValueError
        If business key hashing is enabled without `business_keys`, or if required columns are missing.

    Examples
    --------
    >>> import pandas as pd
    >>> df = pd.DataFrame({"BUSINESS_KEY": ["A1"], "amount": [10.5]})
    >>> out = add_hash_columns(df, business_keys=["BUSINESS_KEY"], engine="pandas")
    >>> "_row_hash" in out.columns
    True
    """
    resolved_engine = _resolve_engine(df, engine)

    # Validate inputs up front so both engines fail early and identically.
    if include_business_key_hash:
        if not business_keys:
            raise ValueError("business_keys must be provided when include_business_key_hash=True.")
        _assert_columns_exist(df, business_keys)

    effective_row_columns = row_hash_columns
    if include_row_hash:
        # Fall back to every non-technical column when none were given.
        effective_row_columns = row_hash_columns or _non_technical_columns(df)
        _assert_columns_exist(df, effective_row_columns)

    if resolved_engine == "pandas":
        result = df.copy()

        def _hash_of(columns):
            # One hash per row over the selected column subset.
            return result[columns].apply(lambda values: _hash_row(values.tolist()), axis=1)

        if include_business_key_hash:
            result["_business_key_hash"] = _hash_of(business_keys)
        if include_row_hash:
            result["_row_hash"] = _hash_of(effective_row_columns)
        return result

    # Spark path: concatenate stringified columns (NULL-safe) and SHA256 them.
    from pyspark.sql import functions as F

    def _spark_hash(columns):
        parts = [F.coalesce(F.col(name).cast("string"), F.lit("<NULL>")) for name in columns]
        return F.sha2(F.concat_ws("||", *parts), 256)

    result = df
    if include_business_key_hash:
        result = result.withColumn("_business_key_hash", _spark_hash(business_keys))
    if include_row_hash:
        result = result.withColumn("_row_hash", _spark_hash(effective_row_columns))
    return result