Commit 07791eff authored by Sonja Tripkovic's avatar Sonja Tripkovic

added pull_from_geodatenviewer in preprocess

parent 8f8ca1cf
......@@ -2,6 +2,12 @@ import pandas as pd
import itertools
import numpy as np
from tqdm import tqdm
import urllib.request
import os
import zipfile
import geopandas as gpd
from typing import List
import shutil
def link_dataframes(A: pd.DataFrame, B: pd.DataFrame, ref_col: str, metric=None, verbose=True) -> (pd.DataFrame, np.ndarray):
'''
......@@ -59,3 +65,50 @@ def link_dataframes(A: pd.DataFrame, B: pd.DataFrame, ref_col: str, metric=None,
)
return combined, np.array(deviations)
def pull_from_geodatenviewer(dir_path: str, squares: List[int]) -> gpd.GeoDataFrame:
    '''
    Downloads .zip files for specified squares from https://www.wien.gv.at/ma41datenviewer/public/,
    extracts them, combines all .shp files to a single geopandas geodataframe, then deletes the
    dir_path directory. The directory is removed even if a download, extraction or read step fails,
    so a failed call does not block a retry with the same dir_path.
    :param dir_path: path to the directory where the files will be downloaded and extracted to;
                     it must not exist yet, because it is deleted again before returning
    :param squares: list of integers denoting the squares to be downloaded
    :return: geopandas geodataframe containing all data from .shp files
             (an empty geodataframe if squares is empty)
    :raises FileExistsError: if dir_path already exists
    ToDo: extend this function to select squares automatically based on measurement coordinates
    '''
    # example of single square with data:
    # https://www.wien.gv.at/ma41datenviewer/downloads/ma41/geodaten/fmzk_bkm/103081_bkm.zip
    url_template = "https://www.wien.gv.at/ma41datenviewer/downloads/ma41/geodaten/fmzk_bkm/{}_bkm.zip"

    # The directory is created outside the try/finally below on purpose: if it already
    # exists it belongs to someone else and must never be removed by the cleanup.
    try:
        os.makedirs(dir_path)
    except FileExistsError as err:
        raise FileExistsError(
            'The folder under this name already exists, choose another name for your directory.'
        ) from err

    try:
        # download, extract and delete each .zip file
        for square_number in squares:
            zip_path = os.path.join(dir_path, "{}.zip".format(square_number))
            urllib.request.urlretrieve(url_template.format(square_number), zip_path)
            # context manager closes the archive even if extraction fails
            with zipfile.ZipFile(zip_path) as zip_ref:
                zip_ref.extractall(dir_path)
            os.remove(zip_path)

        # read every extracted .shp file; the data is loaded into memory here,
        # so the files can be deleted before the frames are combined
        frames = [
            gpd.read_file(os.path.join(dir_path, "{}_bkm.shp".format(square)))
            for square in squares
        ]
    finally:
        shutil.rmtree(dir_path)  # deletes the directory containing all the files

    if not frames:
        # pd.concat rejects an empty list; keep the documented return type
        return gpd.GeoDataFrame()
    # a single concat avoids re-copying the accumulated rows on every iteration
    return pd.concat(frames, ignore_index=True)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment