using Tensorflow.Keras.Utils;
using Tensorflow.NumPy;
using System.Linq;
+ using Google.Protobuf.Collections;
+ using Microsoft.VisualBasic;
+ using OneOf.Types;
+ using static HDF.PInvoke.H5;
+ using System.Data;
+ using System.Reflection.Emit;
+ using System.Xml.Linq;

namespace Tensorflow.Keras.Datasets
{
    /// <summary>
    /// This is a dataset of 25,000 movie reviews from IMDB, labeled by sentiment
    /// (positive/negative). Reviews have been preprocessed, and each review is
    /// encoded as a list of word indexes (integers).
+   /// For convenience, words are indexed by overall frequency in the dataset,
+   /// so that for instance the integer "3" encodes the 3rd most frequent word in
+   /// the data. This allows for quick filtering operations such as:
+   /// "only consider the top 10,000 most common words, but eliminate the top 20
+   /// most common words".
+   /// As a convention, "0" does not stand for a specific word, but instead is used
+   /// to encode the pad token.
+   /// Args:
+   ///     path: where to cache the data (relative to %TEMP%/imdb/imdb.npz).
+   ///     num_words: integer or None. Words are ranked by how often they occur
+   ///         (in the training set) and only the `num_words` most frequent words
+   ///         are kept. Any less frequent word will appear as the `oov_char` value
+   ///         in the sequence data. If None, all words are kept. Defaults to `None`.
+   ///     skip_top: skip the top N most frequently occurring words (which may not
+   ///         be informative). These words will appear as the `oov_char` value in
+   ///         the dataset. When 0, no words are skipped. Defaults to `0`.
+   ///     maxlen: int or None. Maximum sequence length. Any longer sequence will
+   ///         be truncated. None means no truncation. Defaults to `None`.
+   ///     seed: int. Seed for reproducible data shuffling.
+   ///     start_char: int. The start of a sequence will be marked with this
+   ///         character. 0 is usually the padding character. Defaults to `1`.
+   ///     oov_char: int. The out-of-vocabulary character. Words that were cut out
+   ///         because of the `num_words` or `skip_top` limits will be replaced
+   ///         with this character.
+   ///     index_from: int. Index actual words with this index and higher.
+   /// Returns:
+   ///     Tuple of Numpy arrays: `(x_train, y_train), (x_test, y_test)`.
+   ///
+   ///     **x_train, x_test**: lists of sequences, which are lists of indexes
+   ///     (integers). If the `num_words` argument was specified, the maximum
+   ///     possible index value is `num_words - 1`. If the `maxlen` argument was
+   ///     specified, the largest possible sequence length is `maxlen`.
+   ///
+   ///     **y_train, y_test**: lists of integer labels (1 or 0).
+   ///
+   /// Raises:
+   ///     ValueError: in case `maxlen` is so low that no input sequence could be kept.
+   /// Note that the 'out of vocabulary' character is only used for words that were
+   /// present in the training set but are not included because they're not making
+   /// the `num_words` cut here. Words that were not seen in the training set but
+   /// are in the test set have simply been skipped.
    /// </summary>
    public class Imdb
    {
        string origin_folder = "https://storage.googleapis.com/tensorflow/tf-keras-datasets/";
        string file_name = "imdb.npz";
        string dest_folder = "imdb";
-
        /// <summary>
        /// Loads the [IMDB dataset](https://ai.stanford.edu/~amaas/data/sentiment/).
        /// </summary>
@@ -41,8 +94,10 @@ public DatasetPass load_data(string path = "imdb.npz",
            int index_from = 3)
        {
            var dst = Download();
-
-           var lines = File.ReadAllLines(Path.Combine(dst, "imdb_train.txt"));
+           // Read the cached imdb.npz archive and pull the train/test splits out of it.
+           var fileBytes = File.ReadAllBytes(Path.Combine(dst, file_name));
+           var (x_train, x_test) = LoadX(fileBytes);
+           var (y_train, y_test) = LoadY(fileBytes);
+           /*var lines = File.ReadAllLines(Path.Combine(dst, "imdb_train.txt"));
            var x_train_string = new string[lines.Length];
            var y_train = np.zeros(new int[] { lines.Length }, np.int64);
            for (int i = 0; i < lines.Length; i++)
@@ -62,7 +117,7 @@ public DatasetPass load_data(string path = "imdb.npz",
                x_test_string[i] = lines[i].Substring(2);
            }

-           var x_test = np.array(x_test_string);
+           var x_test = np.array(x_test_string);*/

            return new DatasetPass
            {
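
For orientation, here is a minimal usage sketch of the loader this diff modifies; it is not part of the commit. It assumes load_data accepts a num_words argument as the doc comment above describes, and that DatasetPass exposes the train and test splits as Train/Test tuples of (NDArray, NDArray), which the truncated return statement above only hints at.

    using System;
    using Tensorflow.Keras.Datasets;

    // Load the IMDB reviews, keeping only the 10,000 most frequent words;
    // rarer words come back as the oov_char index.
    var imdb = new Imdb();
    var dataset = imdb.load_data(num_words: 10000);   // num_words as assumed from the docs above

    var (x_train, y_train) = dataset.Train;   // assumed (NDArray, NDArray) members
    var (x_test, y_test) = dataset.Test;
    Console.WriteLine($"train: {x_train.shape}, test: {x_test.shape}");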