使用xarray与cfgrib读取GRIB文件时部分时间索引触发[Errno 22]无效参数错误
使用xarray与cfgrib读取GRIB文件时部分时间索引触发[Errno 22]无效参数错误
问题背景
我有一个包含再分析数据(an)、集合平均数据(em)和集合离散度数据(es,ensemble spread)的GRIB文件,尝试用xarray结合cfgrib引擎读取其中的再分析数据。经纬度值可以正常获取,但访问部分时间索引的场变量(比如散度d)时会触发[Errno 22] Invalid argument错误。而且可访问的时间范围还出现了莫名的变化:最初只有索引0-2可用;第二天索引0-12能正常访问,但13及之后的索引仍然报错。期间我完全没有修改过文件。
代码示例
```python
import xarray as xr
import cfgrib
import numpy as np

file = 'C:/Users/Downloads/04292020_rean_ensmean_ensspread.grib'

# 筛选读取再分析数据
ds = xr.open_dataset(file, engine="cfgrib", filter_by_keys={'dataType': 'an'})

time = 5
res = 2  # 2 = 半度分辨率
[phi, theta] = np.meshgrid(ds.variables['longitude'][::res].values,
                           ds.variables['latitude'][::res].values)

# 访问散度数据时触发错误
divergence = ds.variables['d'][time,:,::res,::res].values
```
数据集基本信息
```
# <bound method Dataset.head of <xarray.Dataset> Size: 20GB
# Dimensions:        (time: 16, isobaricInhPa: 23, latitude: 721, longitude: 1440)
# Coordinates:
#     number         int32 4B ...
#   * time           (time) datetime64[ns] 128B 2020-04-29 ... 2020-04-29T23:00:00
#     step           timedelta64[ns] 8B ...
#   * isobaricInhPa  (isobaricInhPa) float64 184B 1e+03 975.0 ... 225.0 200.0
#   * latitude       (latitude) float64 6kB 90.0 89.75 89.5 ... -89.5 -89.75 -90.0
#   * longitude      (longitude) float64 12kB 0.0 0.25 0.5 ... 359.2 359.5 359.8
#     valid_time     (time) datetime64[ns] 128B ...
# Data variables: (12/13)
#     d              (time, isobaricInhPa, latitude, longitude) float32 2GB ...
#     cc             (time, isobaricInhPa, latitude, longitude) float32 2GB ...
#     z              (time, isobaricInhPa, latitude, longitude) float32 2GB ...
#     pv             (time, isobaricInhPa, latitude, longitude) float32 2GB ...
#     r              (time, isobaricInhPa, latitude, longitude) float32 2GB ...
#     clwc           (time, isobaricInhPa, latitude, longitude) float32 2GB ...
#     ...
#     crwc           (time, isobaricInhPa, latitude, longitude) float32 2GB ...
#     t              (time, isobaricInhPa, latitude, longitude) float32 2GB ...
#     u              (time, isobaricInhPa, latitude, longitude) float32 2GB ...
#     v              (time, isobaricInhPa, latitude, longitude) float32 2GB ...
#     w              (time, isobaricInhPa, latitude, longitude) float32 2GB ...
#     vo             (time, isobaricInhPa, latitude, longitude) float32 2GB ...
# Attributes:
#     GRIB_edition:            1
#     GRIB_centre:             ecmf
#     GRIB_centreDescription:  European Centre for Medium-Range Weather Forecasts
#     GRIB_subCentre:          0
#     Conventions:             CF-1.7
#     institution:             European Centre for Medium-Range Weather Forecasts
#     history:                 2025-03-23T18:13 GRIB to CDM+CF via cfgrib-0.9.1...>
```
完整错误回溯
```
divergence = ds.isel(time=13).d.data
---------------------------------------------------------------------------
OSError                                   Traceback (most recent call last)
Cell In[30], line 1
----> 1 divergence = ds.isel(time=13).d.data

File ~\anaconda3\Lib\site-packages\xarray\core\dataarray.py:795, in DataArray.data(self)
    783 @property
    784 def data(self) -> Any:
    785     """
    786     The DataArray's data as an array. The underlying array type
    787     (e.g. dask, sparse, pint) is preserved.
   (...)
    793     DataArray.values
    794     """
--> 795     return self.variable.data

File ~\anaconda3\Lib\site-packages\xarray\core\variable.py:474, in Variable.data(self)
    472     return self._data
    473 elif isinstance(self._data, indexing.ExplicitlyIndexed):
--> 474     return self._data.get_duck_array()
    475 else:
    476     return self.values

File ~\anaconda3\Lib\site-packages\xarray\core\indexing.py:840, in MemoryCachedArray.get_duck_array(self)
    839 def get_duck_array(self):
--> 840     self._ensure_cached()
    841     return self.array.get_duck_array()

File ~\anaconda3\Lib\site-packages\xarray\core\indexing.py:837, in MemoryCachedArray._ensure_cached(self)
    836 def _ensure_cached(self):
--> 837     self.array = as_indexable(self.array.get_duck_array())

File ~\anaconda3\Lib\site-packages\xarray\core\indexing.py:794, in CopyOnWriteArray.get_duck_array(self)
    793 def get_duck_array(self):
--> 794     return self.array.get_duck_array()

File ~\anaconda3\Lib\site-packages\xarray\core\indexing.py:657, in LazilyIndexedArray.get_duck_array(self)
    653     array = apply_indexer(self.array, self.key)
    654 else:
    655     # If the array is not an ExplicitlyIndexedNDArrayMixin,
    656     # it may wrap a BackendArray so use its __getitem__
--> 657     array = self.array[self.key]
    659 # self.array[self.key] is now a numpy array when
    660 # self.array is a BackendArray subclass
    661 # and self.key is BasicIndexer((slice(None, None, None),))
    662 # so we need the explicit check for ExplicitlyIndexed
    663 if isinstance(array, ExplicitlyIndexed):

File ~\anaconda3\Lib\site-packages\cfgrib\xarray_plugin.py:163, in CfGribArrayWrapper.__getitem__(self, key)
    159 def __getitem__(
    160     self,
    161     key: xr.core.indexing.ExplicitIndexer,
    162 ) -> np.ndarray:
--> 163     return xr.core.indexing.explicit_indexing_adapter(
    164         key, self.shape, xr.core.indexing.IndexingSupport.BASIC, self._getitem
    165     )

File ~\anaconda3\Lib\site-packages\xarray\core\indexing.py:1018, in explicit_indexing_adapter(key, shape, indexing_support, raw_indexing_method)
    996 """Support explicit indexing by delegating to a raw indexing method.
    997
    998 Outer and/or vectorized indexers are supported by indexing a second time
   (...)
   1015 Indexing result, in the form of a duck numpy-array.
   1016 """
   1017 raw_key, numpy_indices = decompose_indexer(key, shape, indexing_support)
-> 1018 result = raw_indexing_method(raw_key.tuple)
   1019 if numpy_indices.tuple:
   1020     # index the loaded np.ndarray
   1021     indexable = NumpyIndexingAdapter(result)

File ~\anaconda3\Lib\site-packages\cfgrib\xarray_plugin.py:172, in CfGribArrayWrapper._getitem(self, key)
    167 def _getitem(
    168     self,
    169     key: T.Tuple[T.Any, ...],
    170 ) -> np.ndarray:
    171     with self.datastore.lock:
--> 172         return self.array[key]

File ~\anaconda3\Lib\site-packages\cfgrib\dataset.py:373, in OnDiskArray.__getitem__(self, item)
    371     continue
    372 # NOTE: fill a single field as found in the message
--> 373 message = self.index.get_field(message_ids[0])  # type: ignore
    374 values = get_values_in_order(message, array_field[tuple(array_field_indexes)].shape)
    375 array_field.__getitem__(tuple(array_field_indexes)).flat[:] = values

File ~\anaconda3\Lib\site-packages\cfgrib\messages.py:488, in FieldsetIndex.get_field(self, message_id)
    487 def get_field(self, message_id: T.Any) -> abc.Field:
--> 488     return ComputedKeysAdapter(self.fieldset[message_id], self.computed_keys)

File ~\anaconda3\Lib\site-packages\cfgrib\messages.py:345, in FileStream.__getitem__(self, item)
    343 def __getitem__(self, item: T.Optional[OffsetType]) -> Message:
    344     with open(self.path, "rb") as file:
--> 345         return self.message_from_file(file, offset=item)

File ~\anaconda3\Lib\site-packages\cfgrib\messages.py:341, in FileStream.message_from_file(self, file, offset, **kwargs)
    339 def message_from_file(self, file, offset=None, **kwargs):
    340     # type: (T.IO[bytes], T.Optional[OffsetType], T.Any) -> Message
--> 341     return Message.from_file(file, offset, **kwargs)

File ~\anaconda3\Lib\site-packages\cfgrib\messages.py:94, in Message.from_file(cls, file, offset, **kwargs)
     92     offset, field_in_message = offset
     93 if offset is not None:
---> 94     file.seek(offset)
     95 codes_id = None
     96 if field_in_message == 0:

OSError: [Errno 22] Invalid argument
```
可能的解决方案
从错误回溯来看,问题出在cfgrib尝试定位GRIB文件内的消息偏移量时失败,结合你遇到的随机可访问范围变化,建议尝试以下几种方法:
1. 删除并重建索引文件:cfgrib会自动生成.idx索引文件缓存消息位置,索引损坏可能导致偏移错误。找到GRIB文件同目录下的.idx文件删除,再重新打开数据集让cfgrib重新生成索引。注意cfgrib默认的索引文件名中带有一段哈希值(形如 xxx.grib.923a8.idx),因此用通配符匹配更稳妥:

```python
import glob
import os

import xarray as xr

file = 'C:/Users/Downloads/04292020_rean_ensmean_ensspread.grib'

# 删除该GRIB文件对应的所有索引文件(xxx.grib.idx 或 xxx.grib.<hash>.idx)
for idx_file in glob.glob(file + '*.idx'):
    os.remove(idx_file)

ds = xr.open_dataset(file, engine="cfgrib", filter_by_keys={'dataType': 'an'})
```

2. 强制禁用索引缓存:在open_dataset中添加backend_kwargs={'indexpath': ''}(cfgrib文档中给出的写法是传入空字符串),让cfgrib每次打开文件都重新扫描生成索引,不依赖缓存:

```python
ds = xr.open_dataset(
    file,
    engine="cfgrib",
    filter_by_keys={'dataType': 'an'},
    backend_kwargs={'indexpath': ''},
)
```

3. 验证文件完整性:用ecCodes工具(比如grib_ls)检查GRIB文件是否存在损坏的消息:

```
grib_ls C:/Users/Downloads/04292020_rean_ensmean_ensspread.grib
```

如果工具无法列出部分时间的消息,说明文件确实损坏,需要重新获取。

4. 升级依赖库:旧版本的cfgrib或ecCodes可能存在大文件处理bug,尝试升级到最新版本:

```
pip install --upgrade cfgrib eccodes
```

5. 分块加载数据:用chunks参数分块读取,避免一次性读取大文件引发的偏移错误:

```python
ds = xr.open_dataset(
    file,
    engine="cfgrib",
    filter_by_keys={'dataType': 'an'},
    chunks={'time': 1, 'isobaricInhPa': 5},
)

# 按需加载指定部分的数据
divergence = ds['d'].isel(
    time=13,
    latitude=slice(None, None, 2),
    longitude=slice(None, None, 2),
).compute()
```
备注:内容来源于stack exchange,提问作者Researcher R




