process.py

import geopandas as gpd
import matplotlib.pyplot as plt
from shapely.geometry import LineString, Point, Polygon, box
from shapely.ops import unary_union
from tqdm import tqdm
import pandas as pd
from typing import Dict, Union, Callable
import numpy as np

def project_onto_streets(
    point_series: gpd.GeoSeries,
    street_series: gpd.GeoSeries,
    epsg: str = "EPSG:31287",
    plot: bool = False,
) -> (gpd.GeoSeries, gpd.GeoSeries):
    """
    Projects each point from the point_series onto the closest element of
    street_series according to a normal projection. For that, both
    series will be transformed into a 2D projection (default EPSG:31287).

    :param point_series: GeoSeries of all points to project onto streets.
    :param street_series: GeoSeries of linestrings representing the streets.
    :param epsg: EPSG projection to use during the projection operation.
    :param plot: Boolean to indiceate whether the result of the operation shall be plotted.

    :return: A tuple of two GeoSeries containing the result of the operation as well as the deviation introduced.
    """

    if epsg == "EPSG:4326":
        raise ValueError(
            "Projection cannot be conducted using geographical coordinates such as EPSG:4326. Select 2D projection instead."
        )

    if point_series.crs is None or street_series.crs is None:
        raise ValueError(
            "CRS needs to be set explicitly for passed Geoseries: Use .set_crs()."
        )

    point_original_crs, street_original_crs = point_series.crs, street_series.crs
    point_series = point_series.to_crs(epsg)
    street_series = street_series.to_crs(epsg)

    projected = []
    for point in tqdm(point_series):
        street_ind = street_series.distance(point).argmin()
        projected.append(
            street_series.loc[street_ind].interpolate(
                street_series.loc[street_ind].project(point)
            ).iloc[0]
        )

    projected = gpd.GeoSeries(projected, index=point_series.index)
    projected = projected.set_crs(epsg)
    deviation_projection = projected.distance(point_series)

    if plot:
        plt.figure()
        plt.title("Deviation Histogram")
        deviation_projection.plot.hist(bins=50)
        plt.show()

        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 10))
        ax1.set_title("Raw Measurements")
        ax2.set_title("After Projection")
        ax1.scatter(point_series.x, point_series.y, label="Point Series")
        clb = ax2.scatter(
            projected.x, projected.y, c=deviation_projection, label="Projected Series"
        )
        for street in street_series:
            ax1.plot(*street.xy, color="black")
            ax2.plot(*street.xy, color="black")
        bar = fig.colorbar(clb)
        bar.ax.set_title("Deviation [Meters]")
        ax1.legend()
        ax2.legend()
        plt.show()

    # Reset Street and Point Series CRS
    street_series = street_series.to_crs(street_original_crs)
    point_series = point_series.to_crs(point_original_crs)
    projected = projected.to_crs(point_original_crs)

    return projected, deviation_projection


def crop_geoseries_by_bounds(
    original_geoseries: gpd.GeoSeries, filter_geoseries: gpd.GeoSeries
) -> gpd.GeoSeries:
    """
    Returns cut off geoseries from 'original_geoseries' which lies within the bounds on the filter_geoseries

    :param original_geoseries: A geopandas geoseries of points/streets/polygons to be filtered, represented in a EPSG:4326 projection
    :param filter_geoseries: A geopandas geoseries based on which a bounding box is for filter is made

    :return: geopandas geoseries of same format as original_geoseries
    """
    if original_geoseries.crs != "EPSG:4326" or filter_geoseries.crs != "EPSG:4326":
        raise ValueError("Make sure to pass data with EPSG:4326 projection")

    filter_bounds = filter_geoseries.total_bounds
    bound_box = box(
        filter_bounds[0], filter_bounds[1], filter_bounds[2], filter_bounds[3]
    )
    bound_box = gpd.GeoDataFrame(geometry=gpd.GeoSeries(bound_box)).set_crs(
        "EPSG:4326"
    )  # create geodataframe for overlay
    filtered_series = (
        gpd.overlay(
            gpd.GeoDataFrame(geometry=original_geoseries), bound_box, how="intersection"
        )
    ).geometry

    return filtered_series


def interpolate_series_at_distance(
    street_series: gpd.GeoSeries, distance: int = 2
) -> gpd.GeoSeries:
    """
    Interpolates along the geoseries streets_series to get nodes at a distance 'distance' apart.
    Make sure to pass data in Cartesian coordinates (e.g. use street_series.to_crs('EPSG:31287') for Austria).

    :param street_series: geopandas geoseries
    :param distance: integer determining the spacing between interpolation points, default=2

    :return: geopandas geoseries overlapping with original geoseries containing more frequent nodes
    """
    if street_series.crs == "EPSG:4326":
        raise ValueError(
            "Make sure to pass data in Cartesian coordinates (e.g. use street_series.to_crs('EPSG:31287') for Austria)"
        )

    if distance <= 0:
        raise ValueError("Distance has to be a positive number.")

    lines = []
    for linestring in street_series:
        list_points = []  # list to hold all the point coords
        current_dist = distance  # set the current distance to place the point
        line_length = (
            linestring.length
        )  # get the total length of the line (not number of points, but actual length)
        list_points.append(
            Point(list(linestring.coords[0]))
        )  # append the starting coordinate to the list
        while (
            current_dist < line_length
        ):  # while the current cumulative distance is less than the total length of the line
            list_points.append(
                linestring.interpolate(current_dist)
            )  # use interpolate and increase the current distance
            current_dist += distance
        list_points.append(
            Point(list(linestring.coords[-1]))
        )  # append end coordinate to the list
        line = LineString(list_points)
        lines.append(line)

    interpolated_street_series = gpd.GeoSeries(lines)
    return interpolated_street_series


def dissolve_touching_polygons(polygon_series: gpd.GeoSeries) -> gpd.GeoSeries:
    """
    Returns geoseries of dissolved touching polygons with closed polygon holes.

    :param polygon_series: geopandas geoseries of polygons

    :return: geopandas geoseries of dissolved polygons
    """
    poly_list = [
        polygon_series[i]
        for i in range(polygon_series.size)
        if polygon_series[i] != None
    ]
    poly_union = unary_union(
        poly_list
    )  # MultiPolygon containing polygon object sequence
    p_list = []
    for geom in poly_union.geoms:  # loop over polygon object sequence
        p = Polygon(
            geom.exterior
        )  # create new polygon only using the exterior (closes holes)
        p_list.append(p)
    polygon_series_disolved = gpd.GeoSeries(p_list)

    return polygon_series_disolved


def delete_indoor_meas(
    measurement_coords: gpd.GeoSeries, polygons: gpd.GeoSeries
) -> gpd.GeoSeries:
    """
    Removes indoor measurements by deleting all measurement points inside the polygons.

    :param measurement_coords: A geopandas geoseries of Points(lon, lat) representing the measurement positions in a EPSG:4326 projection
    :param polygons: A geopandas geoseries of Polygons in EPSG:4326 projection

    :return: A geopandas geoseries of Points(lon, lat) representing the outdoor measurement positions in a EPSG:4326 projection

    """
    meas_gdf = gpd.GeoDataFrame(geometry=measurement_coords)
    polygons_gdf = gpd.GeoDataFrame(geometry=polygons)

    indoor_meas_gdf = gpd.sjoin(meas_gdf, polygons_gdf)
    outdoor_meas_gdf = meas_gdf.drop(indoor_meas_gdf.index).reset_index()

    return outdoor_meas_gdf.geometry


def assign_landuse_classes(
    point_geodataframe: gpd.GeoDataFrame,
    polygon_geodataframe: gpd.GeoDataFrame,
    cut_polygons_by_buffer: bool = False,
    buffer_size: int = 0,
    category_function : Union[None, Callable] = lambda land_use_code: "Urban" if land_use_code < 200 else "Rural",
    epsg: str = "EPSG:31287",
    n_categories: int = 5
) -> gpd.GeoDataFrame:
    '''
    Returns a DataFrame with the landuse class distribution in the area defined by the buffer_size around the passed locations
    Landuseclasses are defined according to the Corine Land Cover Project:
    https://land.copernicus.eu/user-corner/technical-library/corine-land-cover-nomenclature-guidelines/html

    For Austria the land_use_classes can be obtained via:
    https://www.data.gv.at/katalog/dataset/76617316-b9e6-4bcd-ba09-e328b578fed2

    For Germany the source is:
    https://gis.uba.de/catalog/OpenSearch.do?type=/Query/OpenSearch.do&search=CORINE

    Current versions of the respective land_use_classes source files are provided in the gitlab project under /data

    :param point_geodataframe: A Geodataframe with geometry of type point for which the landuse classes will be evaluated.
    :param polygon_geodataframe: A GeoDataFrame with the landuse class polygons. Can be retrieved from
    :param cut_polygons_by_buffer: If True the polygon_geodataframe will be cut by the total bounds of the point series. Can result in a speedup.
    :param buffer_size: The buffer_size in meters used to construct the area around points for which the distribution of classes will be computed.
    :param category_function: If not None a additional dataframe is returned which aggregates land_use classes to the defined categories
    :param epsg: The projection used for the computation of the area. Defaults to EPSG:31287 for Austria.
    :param n_categories: Maximum number of top categories sorted by the respective percentage values to be considered.

    :return: A GeoDataFrame with columns describing the percentage of the area assigned to the respective land use class.
    '''

    if point_geodataframe.crs is None or polygon_geodataframe.crs is None:
        raise ValueError(
            "CRS needs to be set explicitly for passed Geoseries: Use .set_crs()."
        )

    point_geodataframe['first5codes'] = None
    point_geodataframe['first5percent'] = None

    if buffer_size < 0:
        raise ValueError(
            "Buffer size has to be a positive integer. Default value is zero."
        )
    if buffer_size != 0:
        point_geodataframe.geometry = point_geodataframe.geometry.buffer(buffer_size)

    if cut_polygons_by_buffer == True:
        xmin, ymin, xmax, ymax = point_geodataframe.to_crs(epsg).total_bounds
        polygon_geodataframe = polygon_geodataframe.cx[xmin-buffer_size:xmax+buffer_size, ymin-buffer_size:ymax+buffer_size]

    point_geodataframe = point_geodataframe.to_crs(epsg)
    polygon_geodataframe = polygon_geodataframe.to_crs(epsg)

    total = len(point_geodataframe)
    results = {}

    for i, element in tqdm(point_geodataframe.iterrows(), total=total):
        point_buffer = element.geometry

        percentages = []
        codes = []
        for j, poly in enumerate(polygon_geodataframe.geometry):
            if point_buffer.intersects(poly):
                percentages.append(point_buffer.intersection(poly).area/point_buffer.area*100)
                codes.append(polygon_geodataframe.iloc[j]['CODE_18'])
        code_percent_dict = {}
        for key, value in zip(codes, percentages):
            code_percent_dict[key] = code_percent_dict.get(key, 0) + value
        code_percent_dict = dict(sorted(code_percent_dict.items(), key=lambda x: x[1], reverse =True))
        firstnpairs = {int(k): dict(code_percent_dict)[k] for k in list(dict(code_percent_dict))[:n_categories]}

        results[i] = firstnpairs

    if category_function is not None:
        processed = {}
        for key, result in results.items():
            processed[key] = {}
            for land_use, area_percentage in result.items():
                category = category_function(land_use)
                processed[key][category] = processed[key].get(category, 0) + area_percentage

        processed_result = pd.DataFrame(processed).transpose()
        processed_result.replace(np.nan, 0, inplace=True)

        return results, processed_result

    else:
        return results