Dataset之MNIST:MNIST(手写数字图片识别+ubyte.gz文件)数据集的下载(基于python语言根据爬虫技术自动下载MNIST数据集)
生活随笔
收集整理的這篇文章主要介紹了
Dataset之MNIST:MNIST(手写数字图片识别+ubyte.gz文件)数据集的下载(基于python语言根据爬虫技术自动下载MNIST数据集)
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
Dataset之MNIST:MNIST(手寫數字圖片識別+ubyte.gz文件)數據集的下載(基于python語言根據爬蟲技術自動下載MNIST數據集)
?
?
目錄
數據集下載的所有代碼
1、主文件 mnist_download_main.py文件
2、mnist.py文件
3、dataset.py文件
4、cache.py
5、download.py文件
?
?
?
數據集下載的所有代碼
代碼打包地址:mnist數據集下載的完整代碼——mnist_download_main.rar
1、主文件 mnist_download_main.py文件
# 1. Load the data-set.
# The MNIST data-set is about 12 MB and is downloaded automatically
# if it is not found in the given directory.
from mnist import MNIST

data = MNIST(data_dir="data/MNIST/")

# The data-set holds 70,000 images with their labels (the image classes),
# split into three mutually independent sub-sets. This tutorial only uses
# the training-set and the test-set.
subset_sizes = (
    ("- Training-set:\t\t{}", data.num_train),
    ("- Validation-set:\t{}", data.num_val),
    ("- Test-set:\t\t{}", data.num_test),
)

print("Size of:")
for template, size in subset_sizes:
    print(template.format(size))
2、mnist.py文件
########################################################################
#
# Downloads the MNIST data-set for recognizing hand-written digits.
#
# Implemented in Python 3.6
#
# Usage:
# 1) Create a new object instance: data = MNIST(data_dir="data/MNIST/")
#    This automatically downloads the files to the given dir.
# 2) Use the training-set as data.x_train, data.y_train and data.y_train_cls
# 3) Get random batches of training data using data.random_batch()
# 4) Use the test-set as data.x_test, data.y_test and data.y_test_cls
#
########################################################################
#
# This file is part of the TensorFlow Tutorials available at:
#
# https://github.com/Hvass-Labs/TensorFlow-Tutorials
#
# Published under the MIT License. See the file LICENSE for details.
#
# Copyright 2016-18 by Magnus Erik Hvass Pedersen
#
########################################################################

# Standard library.
import gzip
import os

# Third-party.
import numpy as np

# Sibling modules of this tutorial.
from dataset import one_hot_encoded
from download import download

########################################################################

# Root URL of the hosted MNIST files; a filename is appended to it
# to form the full download URL.
base_url = "https://storage.googleapis.com/cvdf-datasets/mnist/"

# Filenames for the data-set.
filename_x_train = "train-images-idx3-ubyte.gz"
filename_y_train = "train-labels-idx1-ubyte.gz"
filename_x_test = "t10k-images-idx3-ubyte.gz"
filename_y_test = "t10k-labels-idx1-ubyte.gz"

########################################################################


class MNIST:
    """
    The MNIST data-set for recognizing hand-written digits.

    This automatically downloads the data-files if they do
    not already exist in the local data_dir.

    Note: Pixel-values are floats between 0.0 and 1.0.
    """

    # The images are 28 pixels in each dimension.
    img_size = 28

    # The images are stored in one-dimensional arrays of this length.
    img_size_flat = img_size * img_size

    # Tuple with height and width of images used to reshape arrays.
    img_shape = (img_size, img_size)

    # Number of colour channels for the images: 1 channel for gray-scale.
    num_channels = 1

    # Tuple with height, width and depth used to reshape arrays.
    # This is used for reshaping in Keras.
    img_shape_full = (img_size, img_size, num_channels)

    # Number of classes, one class for each of 10 digits.
    num_classes = 10

    def __init__(self, data_dir="data/MNIST/"):
        """
        Load the MNIST data-set. Automatically downloads the files
        if they do not already exist locally.

        The 60000 images of the original training file are split into
        the first `num_train` images (training-set) and the remainder
        (validation-set).

        :param data_dir: Base-directory for downloading files.
        """

        # Copy args to self.
        self.data_dir = data_dir

        # Number of images in each sub-set.
        self.num_train = 55000
        self.num_val = 5000
        self.num_test = 10000

        # Download / load the training-set.
        x_train = self._load_images(filename=filename_x_train)
        y_train_cls = self._load_cls(filename=filename_y_train)

        # Split the training-set into train / validation.
        # Pixel-values are converted from ints between 0 and 255
        # to floats between 0.0 and 1.0.
        self.x_train = x_train[0:self.num_train] / 255.0
        self.x_val = x_train[self.num_train:] / 255.0
        self.y_train_cls = y_train_cls[0:self.num_train]
        self.y_val_cls = y_train_cls[self.num_train:]

        # Download / load the test-set.
        self.x_test = self._load_images(filename=filename_x_test) / 255.0
        self.y_test_cls = self._load_cls(filename=filename_y_test)

        # Convert the class-numbers from bytes to ints as that is needed
        # some places in TensorFlow.
        # The builtin `int` is used because the `np.int` alias was
        # deprecated in NumPy 1.20 and removed in NumPy 1.24; both name
        # the same dtype, so results are unchanged on older versions.
        self.y_train_cls = self.y_train_cls.astype(int)
        self.y_val_cls = self.y_val_cls.astype(int)
        self.y_test_cls = self.y_test_cls.astype(int)

        # Convert the integer class-numbers into one-hot encoded arrays.
        self.y_train = one_hot_encoded(class_numbers=self.y_train_cls,
                                       num_classes=self.num_classes)
        self.y_val = one_hot_encoded(class_numbers=self.y_val_cls,
                                     num_classes=self.num_classes)
        self.y_test = one_hot_encoded(class_numbers=self.y_test_cls,
                                      num_classes=self.num_classes)

    def _load_data(self, filename, offset):
        """
        Load the data in the given file. Automatically downloads the file
        if it does not already exist in the data_dir.

        :param filename: Name of the data-file.
        :param offset: Start offset in bytes when reading the data-file.
        :return: The data as a numpy array of unsigned bytes.
        """

        # Download the file from the internet if it does not exist locally.
        download(base_url=base_url, filename=filename,
                 download_dir=self.data_dir)

        # Read the data-file, skipping the first `offset` bytes.
        path = os.path.join(self.data_dir, filename)
        with gzip.open(path, 'rb') as f:
            data = np.frombuffer(f.read(), np.uint8, offset=offset)

        return data

    def _load_images(self, filename):
        """
        Load image-data from the given file.
        Automatically downloads the file if it does not exist locally.

        :param filename: Name of the data-file.
        :return: Numpy array with shape (num_images, img_size_flat).
        """

        # Read the data as one long array of bytes,
        # skipping the first 16 bytes of the file.
        data = self._load_data(filename=filename, offset=16)

        # Reshape to 2-dim array with shape (num_images, img_size_flat).
        images_flat = data.reshape(-1, self.img_size_flat)

        return images_flat

    def _load_cls(self, filename):
        """
        Load class-numbers from the given file.
        Automatically downloads the file if it does not exist locally.

        :param filename: Name of the data-file.
        :return: Numpy array.
        """
        # The first 8 bytes of the file are skipped.
        return self._load_data(filename=filename, offset=8)

    def random_batch(self, batch_size=32):
        """
        Create a random batch of training-data.

        The indices are drawn with replacement, so the same image
        may occur more than once in a batch.

        :param batch_size: Number of images in the batch.
        :return: 3 numpy arrays (x, y, y_cls)
        """

        # Create a random index into the training-set.
        idx = np.random.randint(low=0, high=self.num_train, size=batch_size)

        # Use the index to lookup random training-data.
        x_batch = self.x_train[idx]
        y_batch = self.y_train[idx]
        y_batch_cls = self.y_train_cls[idx]

        return x_batch, y_batch, y_batch_cls


########################################################################
3、dataset.py文件
########################################################################
#
# Class for creating a data-set consisting of all files in a directory.
#
# Example usage is shown in the file knifey.py and Tutorial #09.
#
# Implemented in Python 3.5
#
########################################################################
#
# This file is part of the TensorFlow Tutorials available at:
#
# https://github.com/Hvass-Labs/TensorFlow-Tutorials
#
# Published under the MIT License. See the file LICENSE for details.
#
# Copyright 2016 by Magnus Erik Hvass Pedersen
#
########################################################################

import numpy as np
import os
import shutil
from cache import cache

########################################################################


def one_hot_encoded(class_numbers, num_classes=None):
    """
    Generate the One-Hot encoded class-labels from an array of integers.

    For example, if class_number=2 and num_classes=4 then
    the one-hot encoded label is the float array: [0. 0. 1. 0.]

    :param class_numbers:
        Array of integers with class-numbers.
        Assume the integers are from zero to num_classes-1 inclusive.

    :param num_classes:
        Number of classes. If None then use max(class_numbers)+1.

    :return:
        2-dim array of shape: [len(class_numbers), num_classes]
    """

    # Find the number of classes if None is provided.
    # Assumes the lowest class-number is zero.
    if num_classes is None:
        num_classes = np.max(class_numbers) + 1

    # Row i of the identity matrix is the one-hot vector for class i,
    # so indexing with the class-numbers selects one row per label.
    return np.eye(num_classes, dtype=float)[class_numbers]


########################################################################


class DataSet:
    def __init__(self, in_dir, exts='.jpg'):
        """
        Create a data-set consisting of the filenames in the given directory
        and sub-dirs that match the given filename-extensions.

        For example, the knifey-spoony data-set (see knifey.py) has the
        following dir-structure:

        knifey-spoony/forky/
        knifey-spoony/knifey/
        knifey-spoony/spoony/
        knifey-spoony/forky/test/
        knifey-spoony/knifey/test/
        knifey-spoony/spoony/test/

        This means there are 3 classes called: forky, knifey, and spoony.

        If we set in_dir = "knifey-spoony/" and create a new DataSet-object
        then it will scan through these directories and create a training-set
        and test-set for each of these classes.

        The training-set will contain a list of all the *.jpg filenames
        in the following directories:

        knifey-spoony/forky/
        knifey-spoony/knifey/
        knifey-spoony/spoony/

        The test-set will contain a list of all the *.jpg filenames
        in the following directories:

        knifey-spoony/forky/test/
        knifey-spoony/knifey/test/
        knifey-spoony/spoony/test/

        See the TensorFlow Tutorial #09 for a usage example.

        Class-numbers are assigned in the order the class-directories are
        returned by os.listdir(), which is not a guaranteed order.

        :param in_dir:
            Root-dir for the files in the data-set.
            This would be 'knifey-spoony/' in the example above.

        :param exts:
            String or tuple of strings with valid filename-extensions.
            Not case-sensitive.

        :return:
            Object instance.
        """

        # Extend the input directory to the full path.
        in_dir = os.path.abspath(in_dir)

        # Input directory.
        self.in_dir = in_dir

        # A single extension given as a string must be wrapped in a tuple
        # first. Iterating over the bare string would split '.jpg' into
        # ('.', 'j', 'p', 'g') and then every filename ending in one of
        # those characters (e.g. 'notes.log', 'image.png') would match.
        if isinstance(exts, str):
            exts = (exts,)

        # Convert all file-extensions to lower-case.
        self.exts = tuple(ext.lower() for ext in exts)

        # Names for the classes.
        self.class_names = []

        # Filenames for all the files in the training-set.
        self.filenames = []

        # Filenames for all the files in the test-set.
        self.filenames_test = []

        # Class-number for each file in the training-set.
        self.class_numbers = []

        # Class-number for each file in the test-set.
        self.class_numbers_test = []

        # Total number of classes in the data-set.
        self.num_classes = 0

        # For all files/dirs in the input directory.
        for name in os.listdir(in_dir):
            # Full path for the file / dir.
            current_dir = os.path.join(in_dir, name)

            # Only directories define classes; plain files are ignored.
            if not os.path.isdir(current_dir):
                continue

            # Add the dir-name to the list of class-names.
            self.class_names.append(name)

            # The class-number for this class.
            class_number = self.num_classes

            # Training-set.

            # Get all the valid filenames in the dir (not sub-dirs).
            filenames = self._get_filenames(current_dir)

            # Append them to the list of all filenames for the training-set,
            # together with one class-number per file.
            self.filenames.extend(filenames)
            self.class_numbers.extend([class_number] * len(filenames))

            # Test-set.

            # Get all the valid filenames in the sub-dir named 'test'.
            filenames_test = self._get_filenames(
                os.path.join(current_dir, 'test'))

            # Append them to the list of all filenames for the test-set,
            # together with one class-number per file.
            self.filenames_test.extend(filenames_test)
            self.class_numbers_test.extend(
                [class_number] * len(filenames_test))

            # Increase the total number of classes in the data-set.
            self.num_classes += 1

    def _get_filenames(self, dir):
        """
        Create and return a list of filenames with matching extensions
        in the given directory.

        :param dir:
            Directory to scan for files. Sub-dirs are not scanned.

        :return:
            List of filenames. Only filenames. Does not include the
            directory. Empty if `dir` is not an existing directory.
        """

        # A missing directory (e.g. a class without a 'test' sub-dir)
        # yields no files. isdir() is used rather than exists() so that
        # a regular file with that name does not make os.listdir() raise.
        if not os.path.isdir(dir):
            return []

        # Get all the filenames with matching extensions.
        # str.endswith() accepts the tuple of extensions directly.
        return [filename for filename in os.listdir(dir)
                if filename.lower().endswith(self.exts)]

    def get_paths(self, test=False):
        """
        Get the full paths for the files in the data-set.

        :param test:
            Boolean. Return the paths for the test-set (True)
            or training-set (False).

        :return:
            Iterator with strings for the path-names.
        """

        if test:
            # Use the filenames and class-numbers for the test-set.
            filenames = self.filenames_test
            class_numbers = self.class_numbers_test

            # Sub-dir for test-set.
            test_dir = "test/"
        else:
            # Use the filenames and class-numbers for the training-set.
            filenames = self.filenames
            class_numbers = self.class_numbers

            # Don't use a sub-dir for test-set.
            # An empty component is skipped by os.path.join().
            test_dir = ""

        for filename, cls in zip(filenames, class_numbers):
            # Full path-name for the file.
            path = os.path.join(self.in_dir, self.class_names[cls],
                                test_dir, filename)

            yield path

    def get_training_set(self):
        """
        Return the list of paths for the files in the training-set,
        and the list of class-numbers as integers,
        and the class-numbers as one-hot encoded arrays.
        """

        return list(self.get_paths()), \
               np.asarray(self.class_numbers), \
               one_hot_encoded(class_numbers=self.class_numbers,
                               num_classes=self.num_classes)

    def get_test_set(self):
        """
        Return the list of paths for the files in the test-set,
        and the list of class-numbers as integers,
        and the class-numbers as one-hot encoded arrays.
        """

        return list(self.get_paths(test=True)), \
               np.asarray(self.class_numbers_test), \
               one_hot_encoded(class_numbers=self.class_numbers_test,
                               num_classes=self.num_classes)

    def copy_files(self, train_dir, test_dir):
        """
        Copy all the files in the training-set to train_dir
        and copy all the files in the test-set to test_dir.

        For example, the normal directory structure for the
        different classes in the training-set is:

        knifey-spoony/forky/
        knifey-spoony/knifey/
        knifey-spoony/spoony/

        Normally the test-set is a sub-dir of the training-set:

        knifey-spoony/forky/test/
        knifey-spoony/knifey/test/
        knifey-spoony/spoony/test/

        But some APIs use another dir-structure for the training-set:

        knifey-spoony/train/forky/
        knifey-spoony/train/knifey/
        knifey-spoony/train/spoony/

        and for the test-set:

        knifey-spoony/test/forky/
        knifey-spoony/test/knifey/
        knifey-spoony/test/spoony/

        :param train_dir: Directory for the training-set e.g. 'knifey-spoony/train/'
        :param test_dir: Directory for the test-set e.g. 'knifey-spoony/test/'
        :return: Nothing.
        """

        # Helper-function for actually copying the files.
        def _copy_files(src_paths, dst_dir, class_numbers):

            # Create a list of dirs for each class, e.g.:
            # ['knifey-spoony/test/forky/',
            #  'knifey-spoony/test/knifey/',
            #  'knifey-spoony/test/spoony/']
            class_dirs = [os.path.join(dst_dir, class_name + "/")
                          for class_name in self.class_names]

            # Check if each class-directory exists, otherwise create it.
            for class_dir in class_dirs:
                if not os.path.exists(class_dir):
                    os.makedirs(class_dir)

            # For all the file-paths and associated class-numbers,
            # copy the file to the destination dir for that class.
            for src, cls in zip(src_paths, class_numbers):
                shutil.copy(src=src, dst=class_dirs[cls])

        # Copy the files for the training-set.
        _copy_files(src_paths=self.get_paths(test=False),
                    dst_dir=train_dir,
                    class_numbers=self.class_numbers)

        print("- Copied training-set to:", train_dir)

        # Copy the files for the test-set.
        _copy_files(src_paths=self.get_paths(test=True),
                    dst_dir=test_dir,
                    class_numbers=self.class_numbers_test)

        print("- Copied test-set to:", test_dir)


########################################################################


def load_cached(cache_path, in_dir):
    """
    Wrapper-function for creating a DataSet-object, which will be
    loaded from a cache-file if it already exists, otherwise a new
    object will be created and saved to the cache-file.

    This is useful if you need to ensure the ordering of the
    filenames is consistent every time you load the data-set,
    for example if you use the DataSet-object in combination
    with Transfer Values saved to another cache-file, see e.g.
    Tutorial #09 for an example of this.

    :param cache_path:
        File-path for the cache-file.

    :param in_dir:
        Root-dir for the files in the data-set.
        This is an argument for the DataSet-init function.

    :return:
        The DataSet-object.
    """

    print("Creating dataset from the files in: " + in_dir)

    # If the object-instance for DataSet(in_dir=data_dir) already
    # exists in the cache-file then reload it, otherwise create
    # an object instance and save it to the cache-file for next time.
    dataset = cache(cache_path=cache_path,
                    fn=DataSet, in_dir=in_dir)

    return dataset


########################################################################
4、cache.py
########################################################################
#
# Cache-wrapper for a function or class.
#
# Save the result of calling a function or creating an object-instance
# to harddisk. This is used to persist the data so it can be reloaded
# very quickly and easily.
#
# Implemented in Python 3.5
#
########################################################################
#
# This file is part of the TensorFlow Tutorials available at:
#
# https://github.com/Hvass-Labs/TensorFlow-Tutorials
#
# Published under the MIT License. See the file LICENSE for details.
#
# Copyright 2016 by Magnus Erik Hvass Pedersen
#
########################################################################

import os
import pickle
import numpy as np

########################################################################


def cache(cache_path, fn, *args, **kwargs):
    """
    Cache-wrapper for a function or class. If the cache-file exists
    then the data is reloaded and returned, otherwise the function
    is called and the result is saved to cache. The fn-argument can
    also be a class instead, in which case an object-instance is
    created and saved to the cache-file.

    The cache-file is only created once the result has been pickled
    completely, so a failed or interrupted save never leaves a truncated
    cache-file behind that later calls would try to load.

    NOTE: The cache-file is read with pickle, which can execute arbitrary
    code while loading. Only use cache-files from a trusted source.

    :param cache_path:
        File-path for the cache-file.

    :param fn:
        Function or class to be called.

    :param args:
        Arguments to the function or class-init.

    :param kwargs:
        Keyword arguments to the function or class-init.

    :return:
        The result of calling the function or creating the object-instance.

    :raises:
        Whatever fn raises, and whatever pickle raises if the result
        cannot be pickled. No cache-file is written in either case.
    """

    # If the cache-file exists.
    if os.path.exists(cache_path):
        # Load the cached data from the file.
        with open(cache_path, mode='rb') as file:
            obj = pickle.load(file)

        print("- Data loaded from cache-file: " + cache_path)
    else:
        # The cache-file does not exist.

        # Call the function / class-init with the supplied arguments.
        obj = fn(*args, **kwargs)

        # Save the data to a temporary file next to the cache-file and
        # rename it into place only when pickling succeeded. Writing
        # directly to cache_path would create the file before pickling,
        # so an unpicklable result would leave a broken cache-file.
        tmp_path = cache_path + ".tmp"
        try:
            with open(tmp_path, mode='wb') as file:
                pickle.dump(obj, file)

            os.replace(tmp_path, cache_path)
        finally:
            # Only still present if saving failed before the rename.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)

        print("- Data saved to cache-file: " + cache_path)

    return obj


########################################################################


def convert_numpy2pickle(in_path, out_path):
    """
    Convert a numpy-file to pickle-file.

    The first version of the cache-function used numpy for saving the data.
    Instead of re-calculating all the data, you can just convert the
    cache-file using this function.

    :param in_path:
        Input file in numpy-format written using numpy.save().

    :param out_path:
        Output file written as a pickle-file.

    :return:
        Nothing.
    """

    # Load the data using numpy.
    data = np.load(in_path)

    # Save the data using pickle.
    with open(out_path, mode='wb') as file:
        pickle.dump(data, file)


########################################################################

if __name__ == '__main__':
    # This is a short example of using a cache-file.

    # This is the function that will only get called if the result
    # is not already saved in the cache-file. This would normally
    # be a function that takes a long time to compute, or if you
    # need persistent data for some other reason.
    def expensive_function(a, b):
        return a * b

    print('Computing expensive_function() ...')

    # Either load the result from a cache-file if it already exists,
    # otherwise calculate expensive_function(a=123, b=456) and
    # save the result to the cache-file for next time.
    result = cache(cache_path='cache_expensive_function.pkl',
                   fn=expensive_function, a=123, b=456)

    print('result =', result)

    # Newline.
    print()

    # This is another example which saves an object to a cache-file.

    # We want to cache an object-instance of this class.
    # The motivation is to do an expensive computation only once,
    # or if we need to persist the data for some other reason.
    class ExpensiveClass:
        def __init__(self, c, d):
            self.c = c
            self.d = d
            self.result = c * d

        def print_result(self):
            print('c =', self.c)
            print('d =', self.d)
            print('result = c * d =', self.result)

    print('Creating object from ExpensiveClass() ...')

    # Either load the object from a cache-file if it already exists,
    # otherwise make an object-instance ExpensiveClass(c=123, d=456)
    # and save the object to the cache-file for the next time.
    obj = cache(cache_path='cache_ExpensiveClass.pkl',
                fn=ExpensiveClass, c=123, d=456)

    obj.print_result()

########################################################################
5、download.py文件
########################################################################
#
# Functions for downloading and extracting data-files from the internet.
#
# Implemented in Python 3.5
#
########################################################################
#
# This file is part of the TensorFlow Tutorials available at:
#
# https://github.com/Hvass-Labs/TensorFlow-Tutorials
#
# Published under the MIT License. See the file LICENSE for details.
#
# Copyright 2016 by Magnus Erik Hvass Pedersen
#
########################################################################

import sys
import os
import urllib.request
import tarfile
import zipfile

########################################################################


def _print_download_progress(count, block_size, total_size):
    """
    Function used for printing the download progress.
    Used as a call-back function (reporthook) for urlretrieve().

    :param count: Number of blocks transferred so far.
    :param block_size: Size of a block in bytes.
    :param total_size: Total size of the file in bytes. Zero or negative
        when the file is empty or the size is not known.
    :return: Nothing.
    """

    # Approximate number of bytes transferred so far.
    downloaded = count * block_size

    if total_size > 0:
        # Percentage completion.
        pct_complete = float(downloaded) / total_size

        # Limit it because rounding errors may cause it to exceed 100%.
        pct_complete = min(1.0, pct_complete)

        # Status-message. Note the \r which means the line should
        # overwrite itself.
        msg = "\r- Download progress: {0:.1%}".format(pct_complete)
    else:
        # No usable total size, so a percentage cannot be computed
        # (dividing would raise ZeroDivisionError or give a negative
        # value). Report the byte-count instead.
        msg = "\r- Download progress: {0} bytes".format(downloaded)

    # Print it.
    sys.stdout.write(msg)
    sys.stdout.flush()


def _retrieve(url, save_path):
    """
    Download url to save_path so that save_path only ever exists
    as a completely downloaded file.

    :param url: Internet URL of the file to download.
    :param save_path: Local path where the finished file is stored.
    :return: Nothing.
    """

    # Download to a temporary name first. The callers treat an existing
    # save_path as "already downloaded", so a partial file left there
    # by an interrupted download would never be repaired.
    part_path = save_path + ".part"

    try:
        urllib.request.urlretrieve(url=url,
                                   filename=part_path,
                                   reporthook=_print_download_progress)

        os.replace(part_path, save_path)
    finally:
        # Only still present if the download failed before the rename.
        if os.path.exists(part_path):
            os.remove(part_path)


########################################################################


def download(base_url, filename, download_dir):
    """
    Download the given file if it does not already exist in the download_dir.

    :param base_url: The internet URL without the filename.
    :param filename: The filename that will be added to the base_url.
    :param download_dir: Local directory for storing the file.
    :return: Nothing.
    """

    # Path for local file.
    save_path = os.path.join(download_dir, filename)

    # Check if the file already exists, otherwise we need to download it now.
    if not os.path.exists(save_path):
        # Check if the download directory exists, otherwise create it.
        if not os.path.exists(download_dir):
            os.makedirs(download_dir)

        print("Downloading", filename, "...")

        # Download the file from the internet.
        url = base_url + filename
        _retrieve(url=url, save_path=save_path)

        print(" Done!")


def maybe_download_and_extract(url, download_dir):
    """
    Download and extract the data if it doesn't already exist.
    Assumes the url is a zip-file or a gzip-compressed tar-ball.
    Files with another extension are downloaded but not extracted.

    :param url:
        Internet URL for the tar-file to download.
        Example: "https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz"

    :param download_dir:
        Directory where the downloaded file is saved.
        Example: "data/CIFAR-10/"

    :return:
        Nothing.
    """

    # Filename for saving the file downloaded from the internet.
    # Use the filename from the URL and add it to the download_dir.
    filename = url.split('/')[-1]
    file_path = os.path.join(download_dir, filename)

    # Check if the file already exists.
    # If it exists then we assume it has also been extracted,
    # otherwise we need to download and extract it now.
    if not os.path.exists(file_path):
        # Check if the download directory exists, otherwise create it.
        if not os.path.exists(download_dir):
            os.makedirs(download_dir)

        # Download the file from the internet.
        _retrieve(url=url, save_path=file_path)

        print()
        print("Download finished. Extracting files.")

        # The archives are opened as context-managers so the
        # file-handles are closed again after extraction.
        if file_path.endswith(".zip"):
            # Unpack the zip-file.
            with zipfile.ZipFile(file=file_path, mode="r") as archive:
                archive.extractall(download_dir)
        elif file_path.endswith((".tar.gz", ".tgz")):
            # Unpack the tar-ball.
            with tarfile.open(name=file_path, mode="r:gz") as archive:
                if hasattr(tarfile, "data_filter"):
                    # The archive comes from the internet, so where the
                    # Python version supports extraction filters, refuse
                    # members that would be written outside download_dir.
                    archive.extractall(download_dir, filter="data")
                else:
                    archive.extractall(download_dir)

        print("Done.")
    else:
        print("Data has apparently already been downloaded and unpacked.")


########################################################################
?
總結
以上是生活随笔為你收集整理的Dataset之MNIST:MNIST(手写数字图片识别+ubyte.gz文件)数据集的下载(基于python语言根据爬虫技术自动下载MNIST数据集)的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: Paper之DL:深度学习高质量论文分类
- 下一篇: ML之HMM:HMM算法相关论文、关键步