|
| 1 | +using ICSharpCode.SharpZipLib.Core; |
| 2 | +using ICSharpCode.SharpZipLib.GZip; |
| 3 | +using NumSharp.Core; |
| 4 | +using System; |
| 5 | +using System.Collections.Generic; |
| 6 | +using System.IO; |
| 7 | +using System.Linq; |
| 8 | +using System.Text; |
| 9 | +using Tensorflow; |
| 10 | + |
| 11 | +namespace TensorFlowNET.Examples.Utility |
| 12 | +{ |
public class MnistDataSet
{
    private const string DEFAULT_SOURCE_URL = "https://storage.googleapis.com/cvdf-datasets/mnist/";
    private const string TRAIN_IMAGES = "train-images-idx3-ubyte.gz";
    private const string TRAIN_LABELS = "train-labels-idx1-ubyte.gz";
    private const string TEST_IMAGES = "t10k-images-idx3-ubyte.gz";
    private const string TEST_LABELS = "t10k-labels-idx1-ubyte.gz";

    /// <summary>
    /// Downloads, extracts and parses the four MNIST archives into train /
    /// validation / test splits.
    /// </summary>
    /// <param name="train_dir">Directory the archives are downloaded to and extracted in.</param>
    /// <param name="one_hot">If true, labels are returned one-hot encoded.</param>
    /// <param name="dtype">Target dtype forwarded to <c>DataSet</c>.</param>
    /// <param name="reshape">Reshape flag forwarded to <c>DataSet</c>.</param>
    /// <param name="validation_size">Number of leading training samples carved off as validation split.</param>
    /// <param name="source_url">Base URL the archives are fetched from.</param>
    public static void read_data_sets(string train_dir,
        bool one_hot = false,
        TF_DataType dtype = TF_DataType.DtInvalid,
        bool reshape = true,
        int validation_size = 5000,
        string source_url = DEFAULT_SOURCE_URL)
    {
        var train_images = extract_images(_download_and_extract(source_url, train_dir, TRAIN_IMAGES));
        var train_labels = extract_labels(_download_and_extract(source_url, train_dir, TRAIN_LABELS), one_hot: one_hot);
        var test_images = extract_images(_download_and_extract(source_url, train_dir, TEST_IMAGES));
        var test_labels = extract_labels(_download_and_extract(source_url, train_dir, TEST_LABELS), one_hot: one_hot);

        int end = train_images.shape[0];
        if (validation_size < 0 || validation_size > end)
            throw new ArgumentOutOfRangeException(nameof(validation_size),
                $"Validation size {validation_size} must be between 0 and {end}.");

        // First `validation_size` samples become the validation split, the rest stay as training data.
        var validation_images = train_images[np.arange(validation_size)];
        var validation_labels = train_labels[np.arange(validation_size)];
        train_images = train_images[np.arange(validation_size, end)];
        train_labels = train_labels[np.arange(validation_size, end)];

        // NOTE(review): only the training DataSet is built and all splits are
        // discarded — presumably work in progress; callers currently get nothing back.
        var train = new DataSet(train_images, train_labels, dtype, reshape);
    }

    // Downloads one archive, gunzips it next to itself, and returns the path of the extracted file.
    private static string _download_and_extract(string source_url, string dir, string archive)
    {
        Web.Download(source_url + archive, dir, archive);
        Compress.ExtractGZip(Path.Join(dir, archive), dir);
        // "name.gz" -> "name": the extracted IDX file has the archive name minus its extension.
        return Path.Join(dir, Path.GetFileNameWithoutExtension(archive));
    }

    /// <summary>
    /// Parses an extracted MNIST IDX3 image file into an NDArray of shape
    /// [num_images, rows, cols, 1] with dtype uint8.
    /// </summary>
    /// <exception cref="ValueError">The file does not start with the IDX3 magic number 2051.</exception>
    public static NDArray extract_images(string file)
    {
        using (var bytestream = new FileStream(file, FileMode.Open, FileAccess.Read))
        {
            var magic = _read32(bytestream);
            if (magic != 2051)
                throw new ValueError($"Invalid magic number {magic} in MNIST image file: {file}");
            var num_images = _read32(bytestream);
            var rows = _read32(bytestream);
            var cols = _read32(bytestream);
            // Widen before multiplying so a corrupt header cannot silently overflow uint.
            var buf = _read_exactly(bytestream, checked((int)((long)rows * cols * num_images)));
            var data = np.frombuffer(buf, np.uint8);
            data = data.reshape((int)num_images, (int)rows, (int)cols, 1);
            return data;
        }
    }

    /// <summary>
    /// Parses an extracted MNIST IDX1 label file into an NDArray of uint8
    /// labels, optionally one-hot encoded over <paramref name="num_classes"/>.
    /// </summary>
    /// <exception cref="ValueError">The file does not start with the IDX1 magic number 2049.</exception>
    public static NDArray extract_labels(string file, bool one_hot = false, int num_classes = 10)
    {
        using (var bytestream = new FileStream(file, FileMode.Open, FileAccess.Read))
        {
            var magic = _read32(bytestream);
            if (magic != 2049)
                throw new ValueError($"Invalid magic number {magic} in MNIST label file: {file}");
            var num_items = _read32(bytestream);
            var buf = _read_exactly(bytestream, checked((int)num_items));
            var labels = np.frombuffer(buf, np.uint8);
            if (one_hot)
                return dense_to_one_hot(labels, num_classes);
            return labels;
        }
    }

    // Expands dense byte labels [n] into a one-hot matrix [n, num_classes].
    private static NDArray dense_to_one_hot(NDArray labels_dense, int num_classes)
    {
        var num_labels = labels_dense.shape[0];
        var labels_one_hot = np.zeros(num_labels, num_classes);

        for (int row = 0; row < num_labels; row++)
        {
            var col = labels_dense.Data<byte>(row);
            labels_one_hot[row, col] = 1;
        }

        return labels_one_hot;
    }

    // Reads a 32-bit big-endian unsigned integer (MNIST header field) from the stream.
    private static uint _read32(FileStream bytestream)
    {
        var buffer = _read_exactly(bytestream, sizeof(uint));
        // IDX headers are big-endian; assemble manually instead of relying on host byte order.
        return ((uint)buffer[0] << 24) | ((uint)buffer[1] << 16) | ((uint)buffer[2] << 8) | buffer[3];
    }

    // Reads exactly `count` bytes, looping because Stream.Read may return fewer
    // bytes than requested; throws on premature end of file instead of
    // silently yielding a partially-filled buffer.
    private static byte[] _read_exactly(Stream stream, int count)
    {
        var buffer = new byte[count];
        int offset = 0;
        while (offset < count)
        {
            int read = stream.Read(buffer, offset, count - offset);
            if (read == 0)
                throw new EndOfStreamException(
                    $"Unexpected end of stream: expected {count} bytes, got {offset}.");
            offset += read;
        }
        return buffer;
    }
}
| 110 | +} |
0 commit comments