Source code for gmtra.preprocessing

"""
Source code for Global Multihazard Transport Risk Analysis (GMTRA)

Preprocessing functions.

Copyright (C) 2019 Elco Koks. All versions released under the GNU Affero General Public License v3.0 license.
"""

import os
import numpy
import pandas
import geopandas
import country_converter as coco
import urllib.request
from tqdm import tqdm
from pathos.multiprocessing import Pool,cpu_count

from shapely.geometry import MultiPolygon
from geopy.distance import vincenty

from gmtra.utils import load_config,create_folder_lookup,map_roads,line_length
from gmtra.fetch import bridges

[docs]def region_bridges(n): """ This function will extract all bridges from OpenStreetMap for the specified region. Arguments: *n* : the index ID of a region in the specified shapefile with all the regions. Returns: *GeoDataFrame* : A geopandas GeoDataFrame with all bridges in a region. Will also save this to a .csv file. """ # specify the file path where all data is located. data_path = load_config()['paths']['data'] # load shapefile with unique information for each region global_regions = geopandas.read_file(os.path.join(data_path,'input_data','global_regions_v2.shp')) # grab the row of the region from the global region shapefile x = global_regions.iloc[n] # get name of the region region = x.GID_2 # extract bridges from OpenStreetMAp bridges_osm = bridges(data_path,region,regional=True) # estimate length of each bridges in meters bridges_osm['length'] = bridges_osm.geometry.apply(line_length) bridges_osm['length'] = bridges_osm['length']*1000 road_dict = map_roads() # map roads to primary, secondary, tertiary and other roads. bridges_osm['road_type'] = bridges_osm.road_type.apply(lambda y: road_dict[y]) bridges_osm['region'] = region bridges_osm['country'] = region[:3] # save to .csv bridges_osm.to_csv(os.path.join(data_path,'bridges_osm','{}.csv'.format(region))) print('{} finished!'.format(region)) return bridges_osm
# except Exception as e: # print('Failed to finish {} because of {}!'.format(region,e))
[docs]def remove_tiny_shapes(x,regionalized=False): """This function will remove the small shapes of multipolygons. Will reduce the size of the file. Arguments: *x* : a geometry feature (Polygon) to simplify. Countries which are very large will see larger (unhabitated) islands being removed. Optional Arguments: *regionalized* : Default is **False**. Set to **True** will use lower threshold settings (default: **False**). Returns: *MultiPolygon* : a shapely geometry MultiPolygon without tiny shapes. """ # if its a single polygon, just return the polygon geometry if x.geometry.geom_type == 'Polygon': return x.geometry # if its a multipolygon, we start trying to simplify and remove shapes if its too big. elif x.geometry.geom_type == 'MultiPolygon': if regionalized == False: area1 = 0.1 area2 = 250 elif regionalized == True: area1 = 0.01 area2 = 50 # dont remove shapes if total area is already very small if x.geometry.area < area1: return x.geometry # remove bigger shapes if country is really big if x['GID_0'] in ['CHL','IDN']: threshold = 0.01 elif x['GID_0'] in ['RUS','GRL','CAN','USA']: if regionalized == True: threshold = 0.01 else: threshold = 0.01 elif x.geometry.area > area2: threshold = 0.1 else: threshold = 0.001 # save remaining polygons as new multipolygon for the specific country new_geom = [] for y in x.geometry: if y.area > threshold: new_geom.append(y) return MultiPolygon(new_geom)
[docs]def planet_osm(): """ This function will download the planet file from the OSM servers. """ data_path = load_config()['paths']['data'] osm_path_in = os.path.join(data_path,'planet_osm') # create directory to save planet osm file if that directory does not exit yet. if not os.path.exists(osm_path_in): os.makedirs(osm_path_in) # if planet file is not downloaded yet, download it. if 'planet-latest.osm.pbf' not in os.listdir(osm_path_in): url = 'https://planet.openstreetmap.org/pbf/planet-latest.osm.pbf' urllib.request.urlretrieve(url, os.path.join(osm_path_in,'planet-latest.osm.pbf')) else: print('Planet file is already downloaded')
[docs]def global_shapefiles(regionalized=False): """ This function will simplify shapes and add necessary columns, to make further processing more quickly For now, we will make use of the latest GADM data: https://gadm.org/download_world.html Optional Arguments: *regionalized* : Default is **False**. Set to **True** will also create the global_regions.shp file. """ data_path = load_config()['paths']['data'] # path to country GADM file if regionalized == False: # load country file country_gadm_path = os.path.join(data_path,'GADM36','gadm36_0.shp') gadm_level0 = geopandas.read_file(country_gadm_path) # remove antarctica, no roads there anyways gadm_level0 = gadm_level0.loc[~gadm_level0['NAME_0'].isin(['Antarctica'])] # remove tiny shapes to reduce size substantially gadm_level0['geometry'] = gadm_level0.apply(remove_tiny_shapes,axis=1) # simplify geometries gadm_level0['geometry'] = gadm_level0.simplify(tolerance = 0.005, preserve_topology=True).buffer(0.01).simplify(tolerance = 0.005, preserve_topology=True) # add additional info glob_info_path = os.path.join(data_path,'input_data','global_information.xlsx') load_glob_info = pandas.read_excel(glob_info_path) gadm_level0 = gadm_level0.merge(load_glob_info,left_on='GID_0',right_on='ISO_3digit') #save to new country file glob_ctry_path = os.path.join(data_path,'input_data','global_countries.shp') gadm_level0.to_file(glob_ctry_path) else: # this is dependent on the country file, so check whether that one is already created: glob_ctry_path = os.path.join(data_path,'input_data','global_countries.shp') if os.path.exists(glob_ctry_path): gadm_level0 = geopandas.read_file(os.path.join(data_path,'input_data','global_countries.shp')) else: print('ERROR: You need to create the country file first') return None # load region file region_gadm_path = os.path.join(data_path,'GADM36','gadm36_2.shp') gadm_level1 = geopandas.read_file(region_gadm_path) # remove tiny shapes to reduce size substantially gadm_level1['geometry'] = gadm_level1.apply(remove_tiny_shapes,axis=1) # simplify geometries gadm_level1['geometry'] = gadm_level1.simplify(tolerance = 0.005, preserve_topology=True).buffer(0.01).simplify(tolerance = 0.005, preserve_topology=True) # add additional info glob_info_path = os.path.join(data_path,'input_data','global_information.xlsx') load_glob_info = pandas.read_excel(glob_info_path) gadm_level1 = gadm_level1.merge(load_glob_info,left_on='GID_0',right_on='ISO_3digit') gadm_level1.rename(columns={'coordinates':'coordinate'}, inplace=True) # add some missing geometries from countries with no subregions get_missing_countries = list(set(list(gadm_level0.GID_0.unique())).difference(list(gadm_level1.GID_0.unique()))) mis_country = gadm_level0.loc[gadm_level0['GID_0'].isin(get_missing_countries)]# mis_country['GID_1'] = mis_country['GID_0']+'_'+str(0)+'_'+str(1) gadm_level1 = geopandas.GeoDataFrame( pandas.concat( [gadm_level1,mis_country] ,ignore_index=True) ) gadm_level1.reset_index(drop=True,inplace=True) #save to new country file gadm_level1.to_file(os.path.join(data_path,'input_data','global_regions.shp'))
[docs]def poly_files(data_path,global_shape,save_shapefile=False,regionalized=False): """ This function will create the .poly files from the world shapefile. These .poly files are used to extract data from the openstreetmap files. This function is adapted from the OSMPoly function in QGIS. Arguments: *data_path* : base path to location of all files. *global_shape*: exact path to the global shapefile used to create the poly files. Optional Arguments: *save_shape_file* : Default is **False**. Set to **True** will the new shapefile with the countries that we include in this analysis will be saved. *regionalized* : Default is **False**. Set to **True** will perform the analysis on a regional level. Returns: *.poly file* for each country in a new dir in the working directory. """ # ============================================================================= # """ Create output dir for .poly files if it is doesnt exist yet""" # ============================================================================= poly_dir = os.path.join(data_path,'country_poly_files') if regionalized == True: poly_dir = os.path.join(data_path,'regional_poly_files') if not os.path.exists(poly_dir): os.makedirs(poly_dir) # ============================================================================= # """ Set the paths for the files we are going to use """ # ============================================================================= wb_poly_out = os.path.join(data_path,'input_data','country_shapes.shp') if regionalized == True: wb_poly_out = os.path.join(data_path,'input_data','regional_shapes.shp') # ============================================================================= # """Load country shapes and country list and only keep the required countries""" # ============================================================================= wb_poly = geopandas.read_file(global_shape) # filter polygon file if regionalized == True: wb_poly = wb_poly.loc[wb_poly['GID_0'] != '-'] wb_poly = wb_poly.loc[wb_poly['TYPE_1'] != 'Water body'] else: wb_poly = wb_poly.loc[wb_poly['ISO_3digit'] != '-'] wb_poly.crs = {'init' :'epsg:4326'} # and save the new country shapefile if requested if save_shapefile == True: wb_poly.to_file(wb_poly_out) # we need to simplify the country shapes a bit. If the polygon is too diffcult, # osmconvert cannot handle it. # wb_poly['geometry'] = wb_poly.simplify(tolerance = 0.1, preserve_topology=False) # ============================================================================= # """ The important part of this function: create .poly files to clip the country # data from the openstreetmap file """ # ============================================================================= num = 0 # iterate over the counties (rows) in the world shapefile for f in wb_poly.iterrows(): f = f[1] num = num + 1 geom=f.geometry # try: # this will create a list of the different subpolygons if geom.geom_type == 'MultiPolygon': polygons = geom # the list will be lenght 1 if it is just one polygon elif geom.geom_type == 'Polygon': polygons = [geom] # define the name of the output file, based on the ISO3 code ctry = f['GID_0'] if regionalized == True: attr=f['GID_1'] else: attr=f['GID_0'] # start writing the .poly file f = open(poly_dir + "/" + attr +'.poly', 'w') f.write(attr + "\n") i = 0 # loop over the different polygons, get their exterior and write the # coordinates of the ring to the .poly file for polygon in polygons: if ctry == 'CAN': dist = vincenty(polygon.centroid.coords[:1][0], (83.24,-79.80), ellipsoid='WGS-84').kilometers if dist < 2000: continue if ctry == 'RUS': dist = vincenty(polygon.centroid.coords[:1][0], (58.89,82.26), ellipsoid='WGS-84').kilometers if dist < 500: print(attr) continue polygon = numpy.array(polygon.exterior) j = 0 f.write(str(i) + "\n") for ring in polygon: j = j + 1 f.write(" " + str(ring[0]) + " " + str(ring[1]) +"\n") i = i + 1 # close the ring of one subpolygon if done f.write("END" +"\n") # close the file when done f.write("END" +"\n") f.close()
# except: # print(f['GID_1'])
[docs]def clip_osm(data_path,planet_path,area_poly,area_pbf): """ Clip the an area osm file from the larger continent (or planet) file and save to a new osm.pbf file. This is much faster compared to clipping the osm.pbf file while extracting through ogr2ogr. This function uses the osmconvert tool, which can be found at http://wiki.openstreetmap.org/wiki/Osmconvert. Either add the directory where this executable is located to your environmental variables or just put it in the 'scripts' directory. Arguments: *continent_osm*: path string to the osm.pbf file of the continent associated with the country. *area_poly*: path string to the .poly file, made through the 'create_poly_files' function. *area_pbf*: path string indicating the final output dir and output name of the new .osm.pbf file. Returns: a clipped .osm.pbf file. """ print('{} started!'.format(area_pbf)) osm_convert_path = os.path.join(data_path,'osmconvert','osmconvert64') try: if (os.path.exists(area_pbf) is not True): os.system('{} {} -B={} --complete-ways -o={}'.format(osm_convert_path,planet_path,area_poly,area_pbf)) print('{} finished!'.format(area_pbf)) except: print('{} did not finish!'.format(area_pbf))
[docs]def single_country(country,regionalized=False,create_poly_files=False): """ Clip a country from the planet osm file and save to individual osm.pbf files This function has the option to extract individual regions Arguments: *country* : The country for which we want extract the data. Keyword Arguments: *regionalized* : Default is **False**. Set to **True** will parallelize the extraction over all regions within a country. *create_poly_files* : Default is **False**. Set to **True** will create new .poly files. """ # set data path data_path = load_config()['paths']['data'] # path to planet file planet_path = os.path.join(data_path,'planet_osm','planet-latest.osm.pbf') # global shapefile path if regionalized == True: world_path = os.path.join(data_path,'input_data','global_regions.shp') else: world_path = os.path.join(data_path,'input_data','global_countries.shp') # create poly files for all countries if create_poly_files == True: poly_files(data_path,world_path,save_shapefile=False,regionalized=regionalized) if not os.path.exists(os.path.join(data_path,'country_poly_files')): os.makedirs(os.path.join(data_path,'country_poly_files')) if not os.path.exists(os.path.join(data_path,'country_osm')): os.makedirs(os.path.join(data_path,'country_osm')) ctry_poly = os.path.join(data_path,'country_poly_files','{}.poly'.format(country)) ctry_pbf = os.path.join(data_path,'country_osm','{}.osm.pbf'.format(country)) if regionalized == False: clip_osm(data_path,planet_path,ctry_poly,ctry_pbf) elif regionalized == True: if (os.path.exists(ctry_pbf) is not True): clip_osm(data_path,planet_path,ctry_poly,ctry_pbf) if not os.path.exists(os.path.join(data_path,'regional_poly_files')): os.makedirs(os.path.join(data_path,'regional_poly_files')) if not os.path.exists(os.path.join(data_path,'region_osm_admin1')): os.makedirs(os.path.join(data_path,'region_osm_admin1')) get_poly_files = [x for x in os.listdir(os.path.join(data_path,'regional_poly_files')) if x.startswith(country)] polyPaths = [os.path.join(data_path,'regional_poly_files',x) for x in get_poly_files] area_pbfs = [os.path.join(data_path,'region_osm_admin1',x.split('.')[0]+'.osm.pbf') for x in get_poly_files] data_paths = [data_path]*len(polyPaths) planet_paths = [ctry_pbf]*len(polyPaths) # and run all regions parallel to each other pool = Pool(cpu_count()-1) pool.starmap(clip_osm, zip(data_paths,planet_paths,polyPaths,area_pbfs))
[docs]def all_countries(subset = [], regionalized=False,reversed_order=False): """ Clip all countries from the planet osm file and save them to individual osm.pbf files Optional Arguments: *subset* : allow for a pre-defined subset of countries. REquires ISO3 codes. Will run all countries if left empty. *regionalized* : Default is **False**. Set to **True** if you want to have the regions of a country as well. *reversed_order* : Default is **False**. Set to **True** to work backwards for a second process of the same country set to prevent overlapping calculations. Returns: clipped osm.pbf files for the defined set of countries (either the whole world by default or the specified subset) """ # set data path data_path = load_config()['paths']['data'] # path to planet file planet_path = os.path.join(data_path,'planet_osm','planet-latest.osm.pbf') # global shapefile path if regionalized == True: world_path = os.path.join(data_path,'input_data','global_regions.shp') else: world_path = os.path.join(data_path,'input_data','global_countries.shp') # create poly files for all countries poly_files(data_path,world_path,save_shapefile=False,regionalized=regionalized) # prepare lists for multiprocessing if not os.path.exists(os.path.join(data_path,'country_poly_files')): os.makedirs(os.path.join(data_path,'country_poly_files')) if not os.path.exists(os.path.join(data_path,'country_osm')): os.makedirs(os.path.join(data_path,'country_osm')) if regionalized == False: get_poly_files = os.listdir(os.path.join(data_path,'country_poly_files')) if len(subset) > 0: polyPaths = [os.path.join(data_path,'country_poly_files',x) for x in get_poly_files if x[:3] in subset] area_pbfs = [os.path.join(data_path,'region_osm_admin1',x.split('.')[0]+'.osm.pbf') for x in get_poly_files if x[:3] in subset] else: polyPaths = [os.path.join(data_path,'country_poly_files',x) for x in get_poly_files] area_pbfs = [os.path.join(data_path,'region_osm_admin1',x.split('.')[0]+'.osm.pbf') for x in get_poly_files] big_osm_paths = [planet_path]*len(polyPaths) elif regionalized == True: if not os.path.exists(os.path.join(data_path,'regional_poly_files')): os.makedirs(os.path.join(data_path,'regional_poly_files')) if not os.path.exists(os.path.join(data_path,'region_osm')): os.makedirs(os.path.join(data_path,'region_osm_admin1')) get_poly_files = os.listdir(os.path.join(data_path,'regional_poly_files')) if len(subset) > 0: polyPaths = [os.path.join(data_path,'regional_poly_files',x) for x in get_poly_files if x[:3] in subset] area_pbfs = [os.path.join(data_path,'region_osm_admin1',x.split('.')[0]+'.osm.pbf') for x in get_poly_files if x[:3] in subset] big_osm_paths = [os.path.join(data_path,'country_osm',x[:3]+'.osm.pbf') for x in get_poly_files if x[:3] in subset] else: polyPaths = [os.path.join(data_path,'regional_poly_files',x) for x in get_poly_files] area_pbfs = [os.path.join(data_path,'region_osm_admin1',x.split('.')[0]+'.osm.pbf') for x in get_poly_files] big_osm_paths = [os.path.join(data_path,'country_osm',x[:3]+'.osm.pbf') for x in get_poly_files] data_paths = [data_path]*len(polyPaths) # allow for reversed order if you want to run two at the same time (convenient to work backwards for the second process, to prevent overlapping calculation) if reversed_order == True: polyPaths = polyPaths[::-1] area_pbfs = area_pbfs[::-1] big_osm_paths = big_osm_paths[::-1] # extract all country osm files through multiprocesing pool = Pool(cpu_count()-1) pool.starmap(clip_osm, zip(data_paths,big_osm_paths,polyPaths,area_pbfs))
[docs]def merge_SSBN_maps(country): """ Function to merge SSBN maps to a country level. Arguments: *country* : ISO3 code of the country for which we want to merge the river and surface flood maps to country level. """ try: print('{} started!'.format(country)) # get path where all hazards all located hazard_path = load_config()['paths']['hazard_data'] # get dictioniary in which we can lookup the name of the country used in the FATHOM flood files. folder_lookup = create_folder_lookup() # get ISO2 and full country names for each country country_ISO2 = coco.convert(names=[country], to='ISO2') country_full = folder_lookup[country] rps = ['5','10','20','50','75','100','200','250','1000'] flood_types = ['fluvial_undefended','pluvial_undefended'] flood_types_abb = ['FU','PU'] flood_mapping = dict(zip(flood_types,flood_types_abb)) # merge all subcountry files into one country file for each hazard. for flood_type in flood_types: new_folder = os.path.join(hazard_path,'InlandFlooding',country_full,'{}_{}_merged'.format(country_ISO2,flood_type)) try: os.mkdir(new_folder) except: None path_to_all_files = os.path.join(hazard_path,'InlandFlooding',country_full,'{}_{}'.format(country_ISO2,flood_type)) full_paths = [os.path.join(path_to_all_files,x) for x in os.listdir(path_to_all_files) if x.endswith('.tif')] for rp in tqdm(rps,desc=flood_type+'_'+country,leave=False,total=len(rps),unit='rp'): get_one_rp = [x for x in full_paths if '-{}-'.format(rp) in x] stringlist_rp = ' '.join(get_one_rp) rp_out = os.path.join(new_folder,'{}-{}-{}.tif'.format(country_ISO2,flood_mapping[flood_type],rp)) os.system('gdal_merge.py -q -o {} {} -co COMPRESS=LZW -co BIGTIFF=YES -co PREDICTOR=2 -co TILED=YES'.format(rp_out,stringlist_rp)) print('{} finished!'.format(country)) except: print('{} failed! It seems we do not have proper flood data for this country.'.format(country))