Source code for measprocess.preprocess
import pandas as pd
import itertools
import numpy as np
from tqdm import tqdm
[docs]def link_dataframes(A: pd.DataFrame, B: pd.DataFrame, ref_col: str, metric=None, verbose=True) -> (pd.DataFrame, np.ndarray):
'''
Merge two DataFrames A and B according to the reference colum based on minimum metric.
Note that the final dataframe will include duplicate entries from B, while entries from A will be unique
:param ref_col: Reference Column to merge dataframes. Has to exist in both frames
:param metric: Metric used to determine matches in ref_col. Default lambda a, b: (a - b).abs()
:return: Tuple of Merged DataFrame with Multindex and Deviation
'''
try:
A[ref_col].iloc[0] - B[ref_col].iloc[0]
except Exception:
raise ValueError("Reference columns has to be numeric")
if not metric:
metric = lambda a, b: (a - b).abs()
indices, deviations = [], []
for _, element in tqdm(A.iterrows(), total=A.shape[0], disable=(not verbose), leave=False, desc="Linking Dataframes"):
distances = metric(
element[ref_col],
B[ref_col]
)
deviations.append(
distances.iloc[distances.argmin()]
)
indices.append(
distances.argmin()
)
#Surpress irrelevant error warning regarding assigment
pd.options.mode.chained_assignment = None
B_with_duplicates = B.iloc[indices]
B_with_duplicates.columns = B.columns
B_with_duplicates.index = A.index
B_with_duplicates['original_indices'] = indices
combined = pd.concat((A, B_with_duplicates), axis=1)
multindex_keys = list(
itertools.chain(
(('A', col) for col in A.columns),
(('B', col) for col in B_with_duplicates.columns)
)
)
combined.columns = pd.MultiIndex.from_tuples(
multindex_keys
)
return combined, np.array(deviations)