Reducing pandas DataFrame memory usage
############################## Method 1 ###################################################
[1] The code is as follows (this version can lose data precision):
import numpy as np


def memory_usage_mb(df, *args, **kwargs):
    """DataFrame memory usage in MB."""
    return df.memory_usage(*args, **kwargs).sum() / 1024**2


def reduce_mem_usage(df, verbose=True):
    # Downcast each numeric column to the smallest dtype whose range covers its values.
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage(deep=True).sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                # Only move to float32 if the values do not need float64 precision.
                c_prec = df[col].apply(lambda x: np.finfo(x).precision).max()
                if (c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max
                        and c_prec == np.finfo(np.float32).precision):
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(
            end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df
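A minimal sketch of the effect of this range-based downcast; the frame and the column names ("clicks", "price") are invented for illustration:

import numpy as np
import pandas as pd

df = pd.DataFrame({
    "clicks": np.random.randint(0, 100, size=1_000_000),  # small integers, stored as a wide int dtype by default
    "price": np.random.rand(1_000_000),                   # float64 values
})
print(df.memory_usage(deep=True).sum() / 1024**2, "MB before")

df = reduce_mem_usage(df)
print(df.dtypes)  # 'clicks' fits the int8 range and is downcast; 'price' stays float64 because of the precision check
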
############################## Method 2 ###################################################
[2] The code is as follows (this version does not lose data precision):
import pandas as pd


def memory_usage_mb(df, *args, **kwargs):
    """DataFrame memory usage in MB."""
    return df.memory_usage(*args, **kwargs).sum() / 1024**2


def reduce_memory_usage(df, deep=True, verbose=True, categories=True):
    # All types that we want to change for "lighter" ones.
    # int8 and float16 are not included because we cannot reduce
    # those data types any further.
    # float32 is not included because float16 has too low precision.
    numeric2reduce = ["int16", "int32", "int64", "float64"]
    start_mem = 0
    if verbose:
        start_mem = memory_usage_mb(df, deep=deep)
    for col, col_type in df.dtypes.items():  # .items() instead of .iteritems(), which newer pandas removed
        best_type = None
        if col_type == "object":
            df[col] = df[col].astype("category")
            best_type = "category"
        elif col_type in numeric2reduce:
            downcast = "integer" if "int" in str(col_type) else "float"
            df[col] = pd.to_numeric(df[col], downcast=downcast)
            best_type = df[col].dtype.name
        # Log the conversion performed.
        if verbose and best_type is not None and best_type != str(col_type):
            print(f"Column '{col}' converted from {col_type} to {best_type}")
    if verbose:
        end_mem = memory_usage_mb(df, deep=deep)
        diff_mem = start_mem - end_mem
        percent_mem = 100 * diff_mem / start_mem
        print(f"Memory usage decreased from"
              f" {start_mem:.2f}MB to {end_mem:.2f}MB"
              f" ({diff_mem:.2f}MB, {percent_mem:.2f}% reduction)")
    return df

The bug in the code above is this line:
df[col] = df[col].astype("category")
It makes a later fillna call fail with an error: on a category column, fillna only accepts values that already exist among the categories, so filling with a new placeholder value raises, whereas on an object column it works as expected.
Therefore any missing-value filling must be done before the memory reduction.
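A minimal sketch of why the fill has to come first (toy column, the placeholder "unknown" is just an illustration):

import pandas as pd

s = pd.Series(["a", "b", None], dtype="object")

cat = s.astype("category")
# cat.fillna("unknown")    # raises, because "unknown" is not one of the existing categories

s = s.fillna("unknown")    # 1. fill missing values while the column is still object
s = s.astype("category")   # 2. only then convert to category to save memory
print(s.cat.categories)    # Index(['a', 'b', 'unknown'], dtype='object')
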
Usage:
import datatable as dt

train = dt.fread(folder + "train.csv")  # folder is the directory holding the CSV
train = train.to_pandas()
train = reduce_mem_usage(train)

############################## Method 3 ###################################################
The code comes from [3]:
import numpy as np
import pandas as pd


def get_stats(df):
    stats = pd.DataFrame(index=df.columns,
                         columns=['na_count', 'n_unique', 'type', 'memory_usage'])
    for col in df.columns:
        stats.loc[col] = [df[col].isna().sum(),
                          df[col].nunique(dropna=False),
                          df[col].dtypes,
                          df[col].memory_usage(deep=True, index=False) / 1024**2]
    stats.loc['Overall'] = [stats['na_count'].sum(),
                            stats['n_unique'].sum(),
                            None,
                            df.memory_usage(deep=True).sum() / 1024**2]
    return stats


def print_header():
    print('col conversion dtype na uniq size')
    print()


def print_values(name, conversion, col):
    template = '{:10} {:16} {:>7} {:2} {:6} {:1.2f}MB'
    print(template.format(name, conversion, str(col.dtypes), col.isna().sum(),
                          col.nunique(dropna=False),
                          col.memory_usage(deep=True, index=False) / 1024**2))


# safe downcast
def sd(col, max_loss_limit=0.001, avg_loss_limit=0.001, na_loss_limit=0,
       n_uniq_loss_limit=0, fillna=0):
    """
    max_loss_limit - don't allow any float to lose precision more than this value.
        Any values are ok for GBT algorithms as long as you don't lose unique values.
        See https://en.wikipedia.org/wiki/Half-precision_floating-point_format#Precision_limitations_on_decimal_values_in_[0,_1]
    avg_loss_limit - same but calculates avg throughout the series.
    na_loss_limit - not really useful.
    n_uniq_loss_limit - very important parameter. If you have a float field with very
        high cardinality you can set this value to something like n_records * 0.01
        in order to allow some field relaxing.
    """
    is_float = str(col.dtypes)[:5] == 'float'
    na_count = col.isna().sum()
    n_uniq = col.nunique(dropna=False)
    try_types = ['float16', 'float32']
    if na_count <= na_loss_limit:
        try_types = ['int8', 'int16', 'float16', 'int32', 'float32']
    for type in try_types:
        col_tmp = col
        # float to int conversion => try to round to minimize casting error
        if is_float and (str(type)[:3] == 'int'):
            col_tmp = col_tmp.copy().fillna(fillna).round()
        col_tmp = col_tmp.astype(type)
        max_loss = (col_tmp - col).abs().max()
        avg_loss = (col_tmp - col).abs().mean()
        na_loss = np.abs(na_count - col_tmp.isna().sum())
        n_uniq_loss = np.abs(n_uniq - col_tmp.nunique(dropna=False))
        if (max_loss <= max_loss_limit and avg_loss <= avg_loss_limit
                and na_loss <= na_loss_limit and n_uniq_loss <= n_uniq_loss_limit):
            return col_tmp
    # field can't be converted
    return col


def reduce_mem_usage_sd(df, deep=True, verbose=False, obj_to_cat=False):
    numerics = ['int16', 'uint16', 'int32', 'uint32', 'int64', 'uint64',
                'float16', 'float32', 'float64']
    start_mem = df.memory_usage(deep=deep).sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        # collect stats
        na_count = df[col].isna().sum()
        n_uniq = df[col].nunique(dropna=False)
        # numerics
        if col_type in numerics:
            df[col] = sd(df[col])
        # strings
        if (col_type == 'object') and obj_to_cat:
            df[col] = df[col].astype('category')
        if verbose:
            print(f'Column {col}: {col_type} -> {df[col].dtypes}, na_count={na_count}, n_uniq={n_uniq}')
        new_na_count = df[col].isna().sum()
        if (na_count != new_na_count):
            print(f'Warning: column {col}, {col_type} -> {df[col].dtypes} lost na values. Before: {na_count}, after: {new_na_count}')
        new_n_uniq = df[col].nunique(dropna=False)
        if (n_uniq != new_n_uniq):
            print(f'Warning: column {col}, {col_type} -> {df[col].dtypes} lost unique values. Before: {n_uniq}, after: {new_n_uniq}')
    end_mem = df.memory_usage(deep=deep).sum() / 1024**2
    percent = 100 * (start_mem - end_mem) / start_mem
    if verbose:
        print('Mem. usage decreased from {:5.2f} Mb to {:5.2f} Mb ({:.1f}% reduction)'.format(start_mem, end_mem, percent))
    return df
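A small sketch of how the per-column "safe downcast" sd() defined above behaves; the two series are made up for illustration:

import pandas as pd

s_int_like = pd.Series([0.0, 1.0, 2.0, 3.0] * 1000)  # float64 but integer-valued, no NaN
print(sd(s_int_like).dtype)   # converted to an integer type (int8 on this data) with zero loss

s_with_na = pd.Series([0.0, 1.0, None] * 1000)        # NaN present, so integer targets are skipped
print(sd(s_with_na).dtype)    # stays floating point, only downcast (float16 on this data)
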
Usage:
print("Stats of train before reduction")
stats = get_stats(train)
print(stats)

train = reduce_mem_usage_sd(train, verbose=True)

print("Stats of train after reduction")
stats = get_stats(train)
print(stats)
Code sources:
[1]https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
[2]https://www.kaggle.com/c/ieee-fraud-detection/discussion/107653#latest-619384
Summary

Method 1 downcasts numeric columns by value range and can lose precision; method 2 uses pd.to_numeric(downcast=...) plus object-to-category conversion without losing precision, but missing values must be filled before converting; method 3 is a "safe downcast" that only accepts a smaller dtype when the measured loss stays within explicit limits.