Source code for dataiter.geojson

# -*- coding: utf-8 -*-

# Copyright (c) 2020 Osmo Salomaa
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

import json

from attd import AttributeDict
from dataiter import DataFrame
from dataiter import DataFrameColumn
from dataiter import util
from dataiter import Vector
from math import inf


[docs]
class GeoJSON(DataFrame):

    """
    A data frame of GeoJSON features.

    GeoJSON is a thin wrapper that loads the features of a GeoJSON file
    into a :class:`.DataFrame`, so all data manipulation is done with the
    regular data frame methods. The geometry of each feature is kept in the
    "geometry" column; no geometric operations are provided for it.

    Everything in the file besides the features is kept in the "metadata"
    attribute, which is an ``attd.AttributeDict``.
    """

    # Names that refer to real instance attributes instead of columns
    ATTRIBUTES = [*DataFrame.ATTRIBUTES, "metadata"]

    # Feature keys that are understood, others are ignored with a warning
    FEATURE_KEYS = [
        "type",
        "properties",
        "geometry",
    ]

    # Values of "type" accepted for individual features
    FEATURE_TYPES = [
        "Feature",
    ]

    # Python types accepted as values of feature properties
    PROPERTY_TYPES = [
        bool,
        int,
        float,
        str,
        type(None),
    ]

    # Values of "type" accepted for the top-level object
    TOP_LEVEL_TYPES = [
        "FeatureCollection",
    ]


[docs]
    def __init__(self, *args, **kwargs):
        """
        Initialize a new GeoJSON object.

        `args` and `kwargs` are forwarded as-is to the parent class and
        work the same way as for ``dict``.

        https://docs.python.org/3/library/stdtypes.html#dict

        The "metadata" attribute starts out holding only the
        "FeatureCollection" type.
        """
        super().__init__(*args, **kwargs)
        default_metadata = AttributeDict(type="FeatureCollection")
        self.metadata = default_metadata


    @classmethod
    def _check_raw_data(cls, data):
        """
        Check that parsed GeoJSON `data` is something we can handle.

        Raises ``TypeError`` if the top-level type is not listed in
        ``TOP_LEVEL_TYPES``. Each feature is then checked individually.
        """
        top_level_type = data.type
        if top_level_type not in cls.TOP_LEVEL_TYPES:
            raise TypeError(f"Top-level type {top_level_type!r} not supported")
        # A single list is shared by all features so that
        # each unknown feature key is warned about only once.
        already_warned = []
        for raw_feature in data.features:
            cls._check_raw_feature(raw_feature, already_warned)

    @classmethod
    def _check_raw_feature(cls, feature, warned_feature_keys):
        """
        Check that a single parsed `feature` is something we can handle.

        `warned_feature_keys` is a list of unknown feature keys that have
        already been warned about. Newly warned keys are appended to it.

        Raises ``TypeError`` if the feature type is not listed in
        ``FEATURE_TYPES`` or if any property value is of a type not listed
        in ``PROPERTY_TYPES``.
        """
        feature_type = feature.type
        if feature_type not in cls.FEATURE_TYPES:
            raise TypeError(f"Feature type {feature_type!r} not supported")
        unknown_keys = set(feature) - set(cls.FEATURE_KEYS)
        for key in unknown_keys:
            if key not in warned_feature_keys:
                print(f"Warning: Ignoring feature key {key!r}")
                warned_feature_keys.append(key)
        allowed_types = tuple(cls.PROPERTY_TYPES)
        for name, value in feature.properties.items():
            if not isinstance(value, allowed_types):
                raise TypeError(f"Property type {type(value)} of {name!r} not supported")


[docs]
    @classmethod
    def read(cls, path, *, encoding="utf-8", columns=[], strings_as_object=inf, dtypes={}, **kwargs):
        """
        Return data from GeoJSON file `path`.

        Will automatically decompress if `path` ends in ``.bz2|.gz|.xz``.

        `columns` is an optional list of columns to limit to. The "geometry"
        column is always included.

        `strings_as_object` is a cutoff point. If any row has more characters
        than that, the whole column will use the object data type. This is
        intended to help limit memory use as NumPy strings are fixed-length and
        can take a huge amount of memory if even a single row is long. If set,
        `dtypes` overrides this.

        `dtypes` is an optional dict mapping column names to NumPy datatypes.

        `kwargs` are passed to ``json.load``.

        Properties missing from a feature are read as ``None``. A feature
        whose "properties" member is ``null``, which the GeoJSON
        specification allows, is read as having no properties at all.

        Raises ``TypeError`` if `strings_as_object` is not a number or if
        the file contains unsupported top-level, feature or property types.
        """
        if (not isinstance(strings_as_object, (int, float)) or
            isinstance(strings_as_object, bool)):
            raise TypeError("Expected a number for strings_as_object")
        with util.xopen(path, "rt", encoding=encoding) as f:
            raw = AttributeDict(json.load(f, **kwargs))
        # RFC 7946 allows the "properties" of a feature to be null. Replace
        # null with an empty mapping up front so that both the validation
        # and the loops below can treat properties uniformly as a mapping.
        # Anything unexpected is left untouched so that it fails later
        # the same way as before.
        features = raw.get("features", None)
        if isinstance(features, (list, tuple)):
            for feature in features:
                if (isinstance(feature, dict) and
                    "properties" in feature and
                    feature["properties"] is None):
                    feature["properties"] = AttributeDict()
        cls._check_raw_data(raw)
        # First pass: collect the union of property names
        # in order of first appearance.
        data = {}
        for feature in raw.features:
            for key in feature.properties:
                data.setdefault(key, [])
        if columns:
            data = {k: v for k, v in data.items() if k in columns}
        # Second pass: fill in values, using None
        # where a feature lacks the property.
        for feature in raw.features:
            for key in data:
                data[key].append(feature.properties.get(key, None))
        data["geometry"] = [x.geometry for x in raw.features]
        # Copy to avoid modifying the dict given by the caller.
        dtypes = dtypes.copy()
        if strings_as_object < inf:
            for name in data:
                if name in dtypes: continue
                if any(isinstance(x, str) and len(x) > strings_as_object
                       for x in data[name]):
                    dtypes[name] = object
        for name, dtype in dtypes.items():
            data[name] = DataFrameColumn(data[name], dtype)
        data = cls(**data)
        # Everything except the features themselves is kept as metadata.
        del raw.features
        data.metadata = raw
        return data



[docs]
    def to_data_frame(self, drop_geometry=False):
        """
        Return a regular data frame with the same columns.

        If `drop_geometry` is true, the "geometry" column, if any, is left
        out of the result.
        """
        columns = {name: column for name, column in dict.items(self)}
        if drop_geometry and "geometry" in columns:
            del columns["geometry"]
        return DataFrame(**columns)


    def to_string(self, *, max_rows=None, max_width=None):
        """
        Return data as a string formatted for display.

        Instead of the full geometry, only the type of each geometry is
        shown, e.g. ``<Point>``. A missing (null) geometry is shown as
        ``<None>``.

        `max_rows` and `max_width` are passed to ``DataFrame.to_string``.
        """
        data = self
        if "geometry" in data.colnames:
            # The GeoJSON specification allows the geometry of a feature
            # to be null, in which case there is no "type" to look up.
            geometry = [
                "<None>" if x is None else f"<{x['type']}>"
                for x in data.geometry
            ]
            data = data.modify(geometry=Vector.fast(geometry, object))
        return DataFrame.to_string(data, max_rows=max_rows, max_width=max_width)


[docs]
    def write(self, path, *, encoding="utf-8", **kwargs):
        """
        Write data to GeoJSON file `path`.

        Will automatically compress if `path` ends in ``.bz2|.gz|.xz``.

        `kwargs` are passed to ``json.dumps``. By default, ``default=str``
        and ``ensure_ascii=False`` are used. ``indent`` (default 2) is not
        passed on, it only sets the leading whitespace of the lines written:
        each metadata item and each feature is written on a line of its own.

        Raises ``ValueError`` if there is no "geometry" column.
        """
        kwargs.setdefault("default", str)
        kwargs.setdefault("ensure_ascii", False)
        # Remove indent from kwargs so that each blob is dumped on a single
        # line. "or 0" covers an explicit indent=None.
        indent_width = kwargs.pop("indent", 2) or 0
        indent1 = " " * indent_width * 1
        indent2 = " " * indent_width * 2
        if "geometry" not in self:
            raise ValueError("Geometry missing")
        data = self.to_list_of_dicts()
        util.makedirs_for_file(path)
        with util.xopen(path, "wt", encoding=encoding) as f:
            f.write("{\n")
            for key, value in self.metadata.items():
                # Dump the key as well instead of just wrapping it in
                # quotes so that quotes, backslashes and control characters
                # in it are escaped and the output remains valid JSON.
                name = json.dumps(str(key), **kwargs)
                blob = json.dumps(value, **kwargs)
                f.write(f'{indent1}{name}: {blob},\n')
            f.write(f'{indent1}"features": [\n')
            for i, item in enumerate(data):
                # What remains of the row after removing
                # the geometry are the properties.
                geometry = item.pop("geometry")
                blob = {"type": "Feature", "properties": item, "geometry": geometry}
                blob = json.dumps(blob, **kwargs)
                # JSON does not allow a comma after the last item.
                comma = "," if i < len(data) - 1 else ""
                f.write(f"{indent2}{blob}{comma}\n")
            f.write(f"{indent1}]\n")
            f.write("}\n")