import pandas as pd import itertools import numpy as np from tqdm import tqdm def link_dataframes(A: pd.DataFrame, B: pd.DataFrame, ref_col: str, metric=None, verbose=True) -> (pd.DataFrame, np.ndarray): ''' Merge two DataFrames A and B according to the reference colum based on minimum metric. Note that the final dataframe will include duplicate entries from B, while entries from A will be unique :param ref_col: Reference Column to merge dataframes. Has to exist in both frames :param metric: Metric used to determine matches in ref_col. Default lambda a, b: (a - b).abs() :return: Tuple of Merged DataFrame with Multindex and Deviation ''' try: A[ref_col].iloc[0] - B[ref_col].iloc[0] except Exception: raise ValueError("Reference columns has to be numeric") if not metric: metric = lambda a, b: (a - b).abs() indices, deviations = [], [] for _, element in tqdm(A.iterrows(), total=A.shape[0], disable=(not verbose), leave=False, desc="Linking Dataframes"): distances = metric( element[ref_col], B[ref_col] ) deviations.append( distances.iloc[distances.argmin()] ) indices.append( distances.argmin() ) #Surpress irrelevant error warning regarding assigment pd.options.mode.chained_assignment = None B_with_duplicates = B.iloc[indices] B_with_duplicates.columns = B.columns B_with_duplicates.index = A.index B_with_duplicates['original_indices'] = indices combined = pd.concat((A, B_with_duplicates), axis=1) multindex_keys = list( itertools.chain( (('A', col) for col in A.columns), (('B', col) for col in B_with_duplicates.columns) ) ) combined.columns = pd.MultiIndex.from_tuples( multindex_keys ) return combined, np.array(deviations)