from matplotlib import pyplot as plt
import tensorflow as tf
from keras import backend as K
from keras.layers import Input
from keras.models import Model
from IPython.display import SVG, display
from keras.utils.vis_utils import model_to_dot
import logging
import numpy as np
import dill as dpickle
from annoy import AnnoyIndex
from tqdm import tqdm
from random import random


def load_text_processor(fname='title_pp.dpkl'):
    """
    Load preprocessors from disk.

    Parameters
    ----------
    fname: str
        file name of ktext.processor object

    Returns
    -------
    num_tokens : int
        size of vocabulary loaded into ktext.processor
    pp : ktext.processor
        the processor you are trying to load

    Typical Usage:
    --------------

    num_decoder_tokens, title_pp = load_text_processor(fname='title_pp.dpkl')
    num_encoder_tokens, body_pp = load_text_processor(fname='body_pp.dpkl')

    """
    # Load files from disk
    with open(fname, 'rb') as f:
        pp = dpickle.load(f)

    num_tokens = max(pp.id2token.keys()) + 1
    print(f'Size of vocabulary for {fname}: {num_tokens:,}')
    return num_tokens, pp


def load_decoder_inputs(decoder_np_vecs='train_title_vecs.npy'):
    """
    Load decoder inputs.

    Parameters
    ----------
    decoder_np_vecs : str
        filename of serialized numpy.array of decoder input (issue title)

    Returns
    -------
    decoder_input_data : numpy.array
        The data fed to the decoder as input during training for teacher forcing.
        This is identical to `decoder_np_vecs` with the last position removed.
    decoder_target_data : numpy.array
        The data the decoder is trained to generate (issue title).
        Calculated by sliding `decoder_np_vecs` one position forward.

    """
    vectorized_title = np.load(decoder_np_vecs)
    # For the decoder input we drop the last token: when training with teacher
    # forcing, the last token only ever appears as a prediction target, never
    # as an input.
    decoder_input_data = vectorized_title[:, :-1]

    # The decoder target is one time step ahead of the decoder input (teacher forcing)
    decoder_target_data = vectorized_title[:, 1:]

    print(f'Shape of decoder input: {decoder_input_data.shape}')
    print(f'Shape of decoder target: {decoder_target_data.shape}')
    return decoder_input_data, decoder_target_data
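

# A small illustration (not part of the original pipeline) of the teacher-forcing
# shift performed by load_decoder_inputs, on a toy array so the slicing is easy
# to see. The token ids below are arbitrary placeholders.
def _demo_teacher_forcing_shift():
    # two "titles", already vectorized and padded: 2=_start_, 3..5=words, 6=_end_
    seqs = np.array([[2, 3, 4, 6],
                     [2, 5, 3, 6]])
    decoder_input = seqs[:, :-1]    # [[2, 3, 4], [2, 5, 3]] -> what the decoder sees
    decoder_target = seqs[:, 1:]    # [[3, 4, 6], [5, 3, 6]] -> what it must predict
    assert decoder_input.shape == decoder_target.shape
    return decoder_input, decoder_target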


def load_encoder_inputs(encoder_np_vecs='train_body_vecs.npy'):
    """
    Load variables & data that are inputs to encoder.

    Parameters
    ----------
    encoder_np_vecs : str
        filename of serialized numpy.array of encoder input (issue body)

    Returns
    -------
    encoder_input_data : numpy.array
        The issue body
    doc_length : int
        The standard document length of the input for the encoder after padding.
        The shape of this array will be (num_examples, doc_length)

    """
    vectorized_body = np.load(encoder_np_vecs)
    # Encoder input is simply the body of the issue text
    encoder_input_data = vectorized_body
    doc_length = encoder_input_data.shape[1]
    print(f'Shape of encoder input: {encoder_input_data.shape}')
    return encoder_input_data, doc_length


def viz_model_architecture(model):
    """Visualize model architecture in Jupyter notebook."""
    display(SVG(model_to_dot(model).create(prog='dot', format='svg')))


def free_gpu_mem():
    """Attempt to free GPU memory by resetting the Keras/TF session."""
    K.get_session().close()
    cfg = tf.ConfigProto()
    cfg.gpu_options.allow_growth = True
    K.set_session(tf.Session(config=cfg))


def test_gpu():
    """Run a toy computation task in tensorflow to test GPU."""
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    session = tf.Session(config=config)
    hello = tf.constant('Hello, TensorFlow!')
    print(session.run(hello))


def plot_model_training_history(history_object):
    """Plots model train vs. validation loss."""
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.plot(history_object.history['loss'])
    plt.plot(history_object.history['val_loss'])
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()


def extract_encoder_model(model):
    """
    Extract the encoder from the original Sequence to Sequence Model.

    Returns a keras model object that has one input (body of issue) and one
    output (encoding of issue, which is the last hidden state).

    Inputs:
    ------
    model: keras model object

    Returns:
    -------
    keras model object

    """
    encoder_model = model.get_layer('Encoder-Model')
    return encoder_model


def extract_decoder_model(model):
    """
    Extract the decoder from the original model.

    Inputs:
    ------
    model: keras model object

    Returns:
    -------
    A Keras model object with the following inputs and outputs:

    Inputs:
    1: the embedding index for the last predicted word, or the <Start> indicator
    2: the last hidden state, or in the case of the first word the hidden state from the encoder

    Outputs:
    1. Prediction (class probabilities) for the next word
    2. The hidden state of the decoder, to be fed back into the decoder at the next time step

    Implementation Notes:
    ----------------------
    We must extract the relevant layers and reconstruct part of the computation
    graph to allow for different inputs, because we are not going to use teacher
    forcing at inference time.

    """
    # The latent dimension is the same throughout the architecture, so we can
    # conveniently read it off the decoder embedding layer, whose output width
    # matches what the decoder GRU emits.
    latent_dim = model.get_layer('Decoder-Word-Embedding').output_shape[-1]

    # Reconstruct the input into the decoder
    decoder_inputs = model.get_layer('Decoder-Input').input
    dec_emb = model.get_layer('Decoder-Word-Embedding')(decoder_inputs)
    dec_bn = model.get_layer('Decoder-Batchnorm-1')(dec_emb)

    # Instead of setting the initial state from the encoder and forgetting about
    # it, at inference time there is no teacher forcing, so we need a feedback
    # loop from predictions back into the GRU. This input layer carries the
    # hidden state that makes that feedback possible.
    gru_inference_state_input = Input(shape=(latent_dim,), name='hidden_state_input')

    # We fetch the trained GRU layer so that its weights are reused. If you
    # inspect the decoder GRU created for training, it takes two tensors as input:
    # (1) the embedding layer output used for teacher forcing (which at inference
    #     is the previous step's prediction, and _start_ on the first time step), and
    # (2) the state, which we initialize from the encoder on the first time step
    #     and thereafter feed back in from the previous prediction step.
    gru_out, gru_state_out = model.get_layer('Decoder-GRU')([dec_bn, gru_inference_state_input])

    # Reconstruct dense layers
    dec_bn2 = model.get_layer('Decoder-Batchnorm-2')(gru_out)
    dense_out = model.get_layer('Final-Output-Dense')(dec_bn2)
    decoder_model = Model([decoder_inputs, gru_inference_state_input],
                          [dense_out, gru_state_out])
    return decoder_model
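

# A hedged sketch (not taken from this module's training code) of the kind of
# training graph that extract_encoder_model / extract_decoder_model assume.
# The layer names referenced above ('Encoder-Model', 'Decoder-Input',
# 'Decoder-Word-Embedding', 'Decoder-Batchnorm-1', 'Decoder-GRU',
# 'Decoder-Batchnorm-2', 'Final-Output-Dense') must exist in the trained model;
# the encoder-internal layer names and all dimensions below are assumptions.
def _toy_seq2seq_model(num_encoder_tokens=100, num_decoder_tokens=80,
                       doc_length=12, latent_dim=16):
    from keras.layers import Embedding, GRU, BatchNormalization, Dense

    # Encoder, wrapped in its own named sub-model so extract_encoder_model can find it
    encoder_inputs = Input(shape=(doc_length,), name='Encoder-Input')
    enc = Embedding(num_encoder_tokens, latent_dim, name='Body-Word-Embedding')(encoder_inputs)
    enc = BatchNormalization(name='Encoder-Batchnorm-1')(enc)
    _, enc_state = GRU(latent_dim, return_state=True, name='Encoder-Last-GRU')(enc)
    encoder_model = Model(encoder_inputs, enc_state, name='Encoder-Model')
    seq2seq_encoder_out = encoder_model(encoder_inputs)

    # Decoder, trained with teacher forcing
    decoder_inputs = Input(shape=(None,), name='Decoder-Input')
    dec = Embedding(num_decoder_tokens, latent_dim, name='Decoder-Word-Embedding')(decoder_inputs)
    dec = BatchNormalization(name='Decoder-Batchnorm-1')(dec)
    dec_gru = GRU(latent_dim, return_state=True, return_sequences=True, name='Decoder-GRU')
    dec_out, _ = dec_gru(dec, initial_state=seq2seq_encoder_out)
    dec_out = BatchNormalization(name='Decoder-Batchnorm-2')(dec_out)
    preds = Dense(num_decoder_tokens, activation='softmax', name='Final-Output-Dense')(dec_out)
    return Model([encoder_inputs, decoder_inputs], preds)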


class Seq2Seq_Inference(object):
    def __init__(self,
                 encoder_preprocessor,
                 decoder_preprocessor,
                 seq2seq_model):
        self.pp_body = encoder_preprocessor
        self.pp_title = decoder_preprocessor
        self.seq2seq_model = seq2seq_model
        self.encoder_model = extract_encoder_model(seq2seq_model)
        self.decoder_model = extract_decoder_model(seq2seq_model)
        self.default_max_len_title = self.pp_title.padding_maxlen
        self.nn = None
        self.rec_df = None

    def generate_issue_title(self,
                             raw_input_text,
                             max_len_title=None):
        """
        Use the seq2seq model to generate a title given the body of an issue.

        Inputs
        ------
        raw_input_text: str
            The body of the issue text as an input string

        max_len_title: int (optional)
            The maximum length of the title the model will generate

        """
        if max_len_title is None:
            max_len_title = self.default_max_len_title
        # Encode the input text, then seed the decoder with the _start_ token
        raw_tokenized = self.pp_body.transform([raw_input_text])
        body_encoding = self.encoder_model.predict(raw_tokenized)
        # We save the encoder's output before it is updated by the decoder loop,
        # because it doubles as an embedding of the encoder's input
        # (the issue body)
        original_body_encoding = body_encoding
        state_value = np.array(self.pp_title.token2id['_start_']).reshape(1, 1)

        decoded_sentence = []
        while True:
            preds, st = self.decoder_model.predict([state_value, body_encoding])

            # Ignore index 0 (padding) and index 1 (unknown). Argmax over the
            # remaining logits returns an index into the sliced array, so we
            # add 2 to recover the index into the full vocabulary.
            pred_idx = np.argmax(preds[:, :, 2:]) + 2

            # retrieve word from index prediction
            pred_word_str = self.pp_title.id2token[pred_idx]

            if pred_word_str == '_end_' or len(decoded_sentence) >= max_len_title:
                break
            decoded_sentence.append(pred_word_str)

            # update the decoder state for the next word
            body_encoding = st
            state_value = np.array(pred_idx).reshape(1, 1)

        return original_body_encoding, ' '.join(decoded_sentence)
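
    @staticmethod
    def _demo_argmax_offset():
        # Illustration only (not part of the original class): why the "+ 2" in
        # generate_issue_title. Slicing off the first two logits (0=padding,
        # 1=unknown) shifts indices down by two, so the argmax over the slice
        # must be shifted back up to recover the true vocabulary index.
        preds = np.array([[[0.05, 0.05, 0.1, 0.7, 0.1]]])  # toy (1, 1, vocab) softmax output
        pred_idx = np.argmax(preds[:, :, 2:]) + 2
        assert pred_idx == 3  # position of the 0.7 in the full vocabulary
        return pred_idx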


    def print_example(self,
                      i,
                      body_text,
                      title_text,
                      url,
                      threshold):
        """
        Prints an example of the model's prediction for manual inspection.
        """
        print('\n\n==============================================')
        print(f'============== Example # {i} =================\n')
        print(url)
        print(f"Issue Body:\n {body_text} \n")

        print(f"Original Title:\n {title_text}")

        emb, gen_title = self.generate_issue_title(body_text)
        print(f"\n****** Machine Generated Title (Prediction) ******:\n {gen_title}")

        if self.nn:
            # return neighbors and distances
            n, d = self.nn.get_nns_by_vector(emb.flatten(), n=3,
                                             include_distances=True)
            neighbors = n[1:]
            dist = d[1:]

            if min(dist) <= threshold:
                cols = ['issue_url', 'issue_title', 'body']
                dfcopy = self.rec_df.iloc[neighbors][cols].copy(deep=True)
                dfcopy['dist'] = dist
                similar_issues_df = dfcopy.query(f'dist <= {threshold}')

                print("\n**** Similar Issues (using encoder embedding) ****:\n")
                display(similar_issues_df)


    def demo_model_predictions(self,
                               n,
                               issue_df,
                               threshold=1):
        """
        Pick n random issues and display predictions.

        Input:
        ------
        n : int
            Number of issues to display from issue_df
        issue_df : pandas DataFrame
            DataFrame that contains the columns `body`, `issue_title`, and `issue_url`.
        threshold : float
            Distance threshold for recommendation of similar issues.

        Returns:
        --------
        None
            Prints the original issue body and the model's prediction.
        """
        # Extract body and title from DF
        body_text = issue_df.body.tolist()
        title_text = issue_df.issue_title.tolist()
        url = issue_df.issue_url.tolist()

        demo_list = np.random.randint(low=1, high=len(body_text), size=n)
        for i in demo_list:
            self.print_example(i,
                               body_text=body_text[i],
                               title_text=title_text[i],
                               url=url[i],
                               threshold=threshold)

    def prepare_recommender(self, vectorized_array, original_df):
        raise NotImplementedError  # disabled; the code below is unreachable, kept for reference
        # TODO: verify vectorized_array == original_df
        self.rec_df = original_df
        emb = self.encoder_model.predict(x=vectorized_array,
                                         batch_size=vectorized_array.shape[0] // 100)

        f = emb.shape[1]
        self.nn = AnnoyIndex(f)
        logging.warning('Adding embeddings')
        for i in tqdm(range(len(emb))):
            self.nn.add_item(i, emb[i])
        logging.warning('Building trees for similarity lookup.')
        self.nn.build(80)
        return self.nn

    def set_recsys_data(self, original_df):
        raise NotImplementedError  # disabled; the code below is unreachable, kept for reference
        self.rec_df = original_df

    def set_recsys_annoyobj(self, annoyobj):
        raise NotImplementedError  # disabled; the code below is unreachable, kept for reference
        self.nn = annoyobj
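

# Hedged end-to-end usage sketch (not part of the original module). The
# preprocessor file names follow the defaults used elsewhere in this file;
# 'seq2seq_model.h5' is a hypothetical path for the trained model.
def _example_inference_session():
    from keras.models import load_model
    _, body_pp = load_text_processor(fname='body_pp.dpkl')
    _, title_pp = load_text_processor(fname='title_pp.dpkl')
    seq2seq_model = load_model('seq2seq_model.h5')  # hypothetical file name
    inferer = Seq2Seq_Inference(encoder_preprocessor=body_pp,
                                decoder_preprocessor=title_pp,
                                seq2seq_model=seq2seq_model)
    _, generated_title = inferer.generate_issue_title(
        'the app crashes whenever I press the save button')
    print(generated_title)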