Commit 774eb6d

Merge pull request #5 from matlab-deep-learning/bert
BERT + FinBERT
2 parents 59abf30 + d40bd93

67 files changed (+3924 −78 lines)
Lines changed: 30 additions & 0 deletions
@@ -0,0 +1,30 @@
function dirpath = convertModelNameToDirectories(name)
% convertModelNameToDirectories Converts the user-facing model name to
% the directory name used by support files.

% Copyright 2021 The MathWorks, Inc.
arguments
    name (1,1) string
end
modelName = userInputToSupportFileName(name);
dirpath = {"data","networks","bert",modelName};
end

function supportfileName = userInputToSupportFileName(name)
persistent map;
if isempty(map)
    names = namesArray();
    map = containers.Map(names(:,1),names(:,2));
end
supportfileName = map(name);
end

function names = namesArray()
names = [
    "base",               "uncased_L12_H768_A12";
    "multilingual-cased", "multicased_L12_H768_A12";
    "medium",             "uncased_L8_H512_A8";
    "small",              "uncased_L4_H512_A8";
    "mini",               "uncased_L4_H256_A4";
    "tiny",               "uncased_L2_H128_A2"];
end
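
For reference, a quick sketch of the mapping in action. This assumes the function lives in the +bert/+internal package, as the call in getSupportFilePath further down suggests:

mdl = "tiny";
dirpath = bert.internal.convertModelNameToDirectories(mdl);
% dirpath is {"data","networks","bert","uncased_L2_H128_A2"}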
Lines changed: 116 additions & 0 deletions
@@ -0,0 +1,116 @@
function weightsStruct = createParameterStruct(oldWeightsStruct)
% createParameterStruct Given the flat struct of BERT model weights, this
% function parses that into a tree-like struct of weights.

% Copyright 2021 The MathWorks, Inc.

f = fieldnames(oldWeightsStruct);
for i = 1:numel(f)
    name = f{i};
    encoderLayerPrefix = "bert_encoder_layer";
    embeddingLayerPrefix = "bert_embeddings";
    poolingLayerPrefix = "bert_pooler_dense";
    langModPrefix = "cls_predictions";
    nspPrefix = "cls_seq_relationship_output";
    genericClassifierPrefix = "classifier_";

    weight = dlarray(oldWeightsStruct.(name));

    if startsWith(name,encoderLayerPrefix)
        % BERT transformer layer weights.
        layerIndex = extractBetween(name,encoderLayerPrefix+"_","_");
        newLayerIndex = str2double(layerIndex)+1;
        layerName = encoderLayerPrefix+"_"+layerIndex;
        shortLayerName = "layer_"+newLayerIndex;
        paramName = extractAfter(name,layerName+"_");
        attentionOrFeedforward = iParseAttentionOrFeedforward(paramName);
        [subParamName,subsubParamName] = iParseAttentionAndFeedforwardParamName(paramName,attentionOrFeedforward);
        weightsStruct.("encoder_layers").(shortLayerName).(attentionOrFeedforward).(subParamName).(subsubParamName) = weight;

    elseif startsWith(name,embeddingLayerPrefix)
        % Embedding parameters
        paramName = extractAfter(name,embeddingLayerPrefix+"_");
        if contains(paramName,"LayerNorm")
            [subname,subsubname] = iParseLayerNorm(paramName);
            weightsStruct.("embeddings").(subname).(subsubname) = weight;
        else
            weightsStruct.("embeddings").(paramName) = weight;
        end

    elseif startsWith(name,poolingLayerPrefix)
        paramName = extractAfter(name,poolingLayerPrefix+"_");
        weightsStruct.("pooler").(paramName) = weight;

    elseif startsWith(name,langModPrefix)
        paramName = extractAfter(name,langModPrefix+"_");
        [subname,subsubname] = iParseLM(paramName);
        weightsStruct.("masked_LM").(subname).(subsubname) = weight;

    elseif startsWith(name,nspPrefix)
        paramName = extractAfter(name,nspPrefix+"_");
        if strcmp(paramName,"weights")
            % This parameter wasn't renamed and transposed before
            % uploading. We can fix it here.
            paramName = "kernel";
            weight = weight.';
        end
        weightsStruct.("sequence_relation").(paramName) = weight;

    elseif startsWith(name,genericClassifierPrefix)
        paramName = extractAfter(name,genericClassifierPrefix);
        weightsStruct.("classifier").(paramName) = weight;
    end
end
end

function name = iParseAttentionOrFeedforward(name)
if contains(name, "attention")
    name = "attention";
else
    name = "feedforward";
end
end

function [name,subname] = iParseAttentionAndFeedforwardParamName(name,attnOrFeedforward)
switch attnOrFeedforward
    case "attention"
        [name,subname] = iParseAttentionParamName(name);
    case "feedforward"
        [name,subname] = iParseFeedforwardParamName(name);
end
end

function [subname,subsubname] = iParseAttentionParamName(name)
if contains(name,"LayerNorm")
    [subname,subsubname] = iParseLayerNorm(name);
else
    name = strrep(name,"self_","");
    name = strrep(name,"_dense","");
    subname = extractBetween(name,"attention_","_");
    subsubname = extractAfter(name,subname+"_");
end
end

function [subname,subsubname] = iParseFeedforwardParamName(name)
if contains(name,"LayerNorm")
    [subname,subsubname] = iParseLayerNorm(name);
else
    subname = extractBefore(name,"_");
    subsubname = extractAfter(name,"dense_");
end
end

function [subname,subsubname] = iParseLayerNorm(name)
subname = "LayerNorm";
subsubname = extractAfter(name,"LayerNorm_");
end

function [subname,subsubname] = iParseLM(name)
if contains(name,"LayerNorm")
    [subname,subsubname] = iParseLayerNorm(name);
else
    name = strrep(name,"dense_","");
    subname = extractBefore(name,"_");
    subsubname = extractAfter(name,"_");
end
end
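
To make the parsing concrete, here is a sketch tracing one flat field through the function. The field name follows the TensorFlow-style naming the prefixes above imply, and the unqualified call assumes the function is on the path; both are illustrative assumptions, not taken from this commit:

% Hypothetical flat checkpoint field for layer 0's query projection.
old.bert_encoder_layer_0_attention_self_query_kernel = rand(4);
params = createParameterStruct(old);
% "layer_0" becomes 1-based "layer_1", and the remainder splits into
% attention -> query -> kernel:
w = params.encoder_layers.layer_1.attention.query.kernel;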
Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@
function [x,untokenizedPieces,ismask] = encodeWithMaskToken(tok,str)
% encodeWithMaskToken This function handles the case of encoding an input
% string that includes tokens such as [MASK].

% Copyright 2021 The MathWorks, Inc.
arguments
    tok bert.tokenizer.BERTTokenizer
    str (1,:) string
end
[seqs,untokenizedPieces] = arrayfun(@(s)encodeScalarString(tok,s),str,'UniformOutput',false);
x = padsequences(seqs,2,'PaddingValue',tok.PaddingCode);
maskCode = tok.MaskCode;
ismask = x==maskCode;
end

function [x,pieces] = encodeScalarString(tok,str)
pieces = split(str,tok.MaskToken);
fulltok = tok.FullTokenizer;
maskCode = fulltok.encode(tok.MaskToken);
x = [];

for i = 1:numel(pieces)
    tokens = fulltok.tokenize(pieces(i));
    if ~isempty(tokens)
        % "" tokenizes to empty - awkward
        x = cat(2,x,fulltok.encode(tokens));
    end
    if i<numel(pieces)
        x = cat(2,x,maskCode);
    end
end
x = [fulltok.encode(tok.StartToken),x,fulltok.encode(tok.SeparatorToken)];
end
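
A hedged usage sketch; it assumes this helper sits in the +bert/+internal package and that a tokenizer is available from a loaded model (e.g. mdl = bert()):

mdl = bert();
[x,~,ismask] = bert.internal.encodeWithMaskToken(mdl.Tokenizer, ...
    "All roads lead to [MASK].");
% x is the padded code sequence; ismask is true at the [MASK] position.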
Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
function filePath = getSupportFilePath(modelName,fileName)
% getSupportFilePath Converts between the model names presented to the
% user and the support file URLs, and returns the local path of the
% downloaded support file.

% Copyright 2021 The MathWorks, Inc.
arguments
    modelName (1,1) string
    fileName (1,1) string
end
directory = bert.internal.convertModelNameToDirectories(modelName);
sd = matlab.internal.examples.utils.getSupportFileDir();
localFile = fullfile(sd,"nnet",directory{:},fileName);
if exist(localFile,'file')~=2
    disp("Downloading "+fileName+" to: "+localFile);
end
fileURL = strjoin([directory,fileName],"/");
filePath = matlab.internal.examples.downloadSupportFile("nnet",fileURL);
end
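
Usage sketch; the package-qualified name and the file name "parameters.mat" are placeholders for illustration, not names confirmed by this diff:

filePath = bert.internal.getSupportFilePath("tiny","parameters.mat");
% First use triggers a download; later calls return the cached path.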

+bert/+internal/inferTypeID.m

Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
function types = inferTypeID(x,separatorCode)
% infer the typeIDs from a CTB unlabeled array x
xsz = size(x);
types = ones(xsz);
sepId = x==separatorCode;
if isa(sepId,'dlarray')
    sepId = extractdata(sepId);
end
% Find which observations have >1 separator - when there is 1 separator,
% any padding is considered "type 1".
cs = cumsum(sepId,2);
obsNeedsType2 = cs(:,end,:)>1;
% Type 2 tokens are those between the first (exclusive) and second
% separator (inclusive) if a second separator was present.
type2positions = circshift(cs==1,1) & obsNeedsType2;
types(type2positions) = 2;
end
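
A worked example of the rule the comments describe (the token codes are illustrative, not taken from the shipped vocabulary):

x = [101 2054 102 2003 102];               % [CLS] A [SEP] B [SEP]
types = bert.internal.inferTypeID(x,102);
% types is [1 1 1 2 2]: everything after the first separator, up to and
% including the second separator, is marked type 2.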
Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
function [toks,probs] = predictMaskedToken(mdl,x,maskIdx,k)
% predictMaskedToken Decode the k most likely tokens for the masked
% position maskIdx of the encoded input x, given a BERT model struct mdl.
arguments
    mdl
    x
    maskIdx
    k (1,1) double {mustBePositive,mustBeInteger} = 1
end
probs = bert.languageModel(x,mdl.Parameters);
probs = extractdata(probs(:,maskIdx));
[~,idx] = maxk(probs,k);
toks = mdl.Tokenizer.FullTokenizer.decode(idx);
end
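
A sketch of the masked-token workflow, combining this with encodeWithMaskToken above (the package-qualified names are assumptions; only inferTypeID's location is shown in this diff):

mdl = bert();
[x,~,ismask] = bert.internal.encodeWithMaskToken(mdl.Tokenizer, ...
    "Paris is the [MASK] of France.");
toks = bert.internal.predictMaskedToken(mdl,x,find(ismask),5);
% toks holds the 5 highest-probability sub-word tokens for the mask.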

+bert/+layer/block.m

Lines changed: 96 additions & 0 deletions
@@ -0,0 +1,96 @@
function z = block(z,weights,hyperParameters,nvp)
% block Transformer block for BERT
%
%   Z = block(X,weights,hyperParameters) computes the BERT-style
%   transformer block on the input X as described in [1]. Here X is a
%   (numFeatures*numHeads)-by-numInputSubwords array. The weights and
%   hyperParameters must be structs in the same format as returned by the
%   bert() function.
%
%   Z = block(X,weights,hyperParameters,'PARAM1',VAL1,'PARAM2',VAL2)
%   specifies the optional parameter name/value pairs:
%
%     'HiddenDropout'    - The dropout probability applied between the
%                          self-attention mechanism and the residual
%                          connection. The default is 0.
%
%     'AttentionDropout' - The dropout probability applied to the
%                          attention probabilities. The default is 0.
%
%     'InputMask'        - A logical mask used in the attention
%                          mechanism, for example to block attending to
%                          padding tokens. The default is [], meaning no
%                          masking is applied.
%
% References:
% [1] https://arxiv.org/abs/1810.04805

% Copyright 2021 The MathWorks, Inc.
arguments
    z
    weights
    hyperParameters
    nvp.HiddenDropout (1,1) double {mustBeNonnegative, mustBeLessThanOrEqual(nvp.HiddenDropout,1)} = 0
    nvp.AttentionDropout (1,1) double {mustBeNonnegative, mustBeLessThanOrEqual(nvp.AttentionDropout,1)} = 0
    nvp.InputMask = []
end
z = attention(z,weights.attention,hyperParameters.NumHeads,nvp.AttentionDropout,nvp.HiddenDropout,nvp.InputMask);
z = ffn(z,weights.feedforward,nvp.HiddenDropout);
end

function z = attention(z,w,num_heads,attentionDropout,dropout,mask)
% The self-attention part of the transformer layer.
layer_input = z;

% Get weights
Q_w = w.query.kernel;
Q_b = w.query.bias;
K_w = w.key.kernel;
K_b = w.key.bias;
V_w = w.value.kernel;
V_b = w.value.bias;

% Put weights into the format expected by transformer.layer.attention
weights.attn_c_attn_w_0 = cat(1,Q_w,K_w,V_w);
weights.attn_c_attn_b_0 = cat(1,Q_b,K_b,V_b);
weights.attn_c_proj_w_0 = w.output.kernel;
weights.attn_c_proj_b_0 = w.output.bias;
hyperparameters.NumHeads = num_heads;
z = transformer.layer.attention(z,[],weights,hyperparameters,'CausalMask',false,'Dropout',attentionDropout,'InputMask',mask);

% Dropout
z = transformer.layer.dropout(z,dropout);

% Residual connection
z = layer_input+z;

% Layer normalize.
z = transformer.layer.normalization(z,w.LayerNorm.gamma,w.LayerNorm.beta);
end

function z = ffn(z,w,dropout)
% The feed-forward network part of the transformer layer.

% Weights for embedding in a higher-dimensional space
int_w = w.intermediate.kernel;
int_b = w.intermediate.bias;

% Weights for projecting back down to the original space
out_w = w.output.kernel;
out_b = w.output.bias;

% Create weights struct for multiLayerPerceptron
weights.mlp_c_fc_w_0 = int_w;
weights.mlp_c_fc_b_0 = int_b;
weights.mlp_c_proj_w_0 = out_w;
weights.mlp_c_proj_b_0 = out_b;
ffn_out = transformer.layer.multiLayerPerceptron(z,weights);

% Dropout
ffn_out = transformer.layer.dropout(ffn_out,dropout);

% Layer normalize.
out_g = w.LayerNorm.gamma;
out_b = w.LayerNorm.beta;
z = transformer.layer.normalization(ffn_out+z,out_g,out_b);
end
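
A minimal usage sketch under stated assumptions: bert() returns a model whose Parameters.Weights matches the struct built by createParameterStruct above, and the only hyperparameter the block reads is NumHeads:

mdl = bert();
hp.NumHeads = 12;                        % BERT-base uses 12 heads
z = dlarray(randn(768,11,'single'));     % hiddenSize-by-numInputSubwords
z = bert.layer.block(z,mdl.Parameters.Weights.encoder_layers.layer_1, ...
    hp,'HiddenDropout',0.1);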

+bert/+layer/classifier.m

Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
function y = classifier(x,p)
% classifier The standard BERT classifier, a single fullyconnect.
%
%   Z = classifier(X,classifierWeights) applies a fullyconnect operation
%   to the input X with weights classifierWeights.kernel and bias
%   classifierWeights.bias. The input X must be an unformatted dlarray of
%   size hiddenSize-by-numObs. The classifierWeights.kernel must be of
%   size outputSize-by-hiddenSize, and the classifierWeights.bias must be
%   of size outputSize-by-1.

% Copyright 2021 The MathWorks, Inc.
y = transformer.layer.convolution1d(x,p.kernel,p.bias);
end
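
A minimal sketch on random weights, matching the sizes in the help text:

p.kernel = dlarray(randn(3,768,'single'));   % outputSize-by-hiddenSize
p.bias   = dlarray(zeros(3,1,'single'));     % outputSize-by-1
x = dlarray(randn(768,5,'single'));          % hiddenSize-by-numObs
y = bert.layer.classifier(x,p);              % 3-by-5 class scores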

+bert/+layer/classifierHead.m

Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
function z = classifierHead(x,poolerWeights,classifierWeights)
% classifierHead The standard classification head for a BERT model.
%
%   Z = classifierHead(X,poolerWeights,classifierWeights) applies
%   bert.layer.pooler and bert.layer.classifier to X with poolerWeights
%   and classifierWeights respectively. Both poolerWeights and
%   classifierWeights must be structs with fields 'kernel' and 'bias'.

% Copyright 2021 The MathWorks, Inc.
z = bert.layer.pooler(x,poolerWeights);
z = bert.layer.classifier(z,classifierWeights);
end
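
And the full head chained together. Random placeholder weights stand in for real ones from mdl.Parameters.Weights; this sketch assumes the pooler reduces the sequence output to a hiddenSize-by-numObs array, which is not shown in this diff:

pw.kernel = dlarray(randn(768,768,'single'));
pw.bias   = dlarray(zeros(768,1,'single'));
cw.kernel = dlarray(randn(3,768,'single'));
cw.bias   = dlarray(zeros(3,1,'single'));
x = dlarray(randn(768,11,'single'));    % sequence output for one obs
z = bert.layer.classifierHead(x,pw,cw);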

+bert/+layer/embedding.m

Lines changed: 21 additions & 0 deletions
@@ -0,0 +1,21 @@
function z = embedding(x,types,positions,w,dropout)
% embedding The BERT embeddings of encoded tokens, token types and token
% positions.
%
%   Z = embedding(X,types,positions,weights,dropoutProbability) computes
%   the embedding of encoded tokens X, token types specified by types,
%   and token positions. Inputs X, types and positions are
%   1-by-numInputTokens-by-numObs unformatted dlarrays. The types take
%   values 1 or 2. The weights input is a struct of embedding weights
%   such as mdl.Parameters.Weights.embeddings where mdl = bert(). The
%   dropoutProbability is a scalar double between 0 and 1 corresponding
%   to the post-embedding dropout probability.

% Copyright 2021 The MathWorks, Inc.
wordEmbedding = embed(x,w.word_embeddings,'DataFormat','CTB');
typeEmbedding = embed(types,w.token_type_embeddings,'DataFormat','CTB');
positionEmbedding = embed(positions,w.position_embeddings,'DataFormat','CTB');
z = wordEmbedding+typeEmbedding+positionEmbedding;
z = transformer.layer.normalization(z,w.LayerNorm.gamma,w.LayerNorm.beta);
z = transformer.layer.dropout(z,dropout);
end
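
A usage sketch for a single three-token sequence (the token codes are illustrative; in practice they come from the tokenizer):

mdl = bert();
x = dlarray([101 7592 102]);        % 1-by-numInputTokens
types = dlarray(ones(size(x)));     % single-segment input, all type 1
positions = dlarray(1:numel(x));    % 1-based token positions
z = bert.layer.embedding(x,types,positions, ...
    mdl.Parameters.Weights.embeddings,0);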
