Skip to content

Commit 9893bfc

Browse files
Update phishing_email_detection_gpt2.py
AI-suggested attempt at a better tokenizer...
1 parent dc4dcc6 commit 9893bfc

File tree

1 file changed

+44
-49
lines changed

1 file changed

+44
-49
lines changed

phishing_email_detection_gpt2.py

Lines changed: 44 additions & 49 deletions
Original file line number | Diff line number | Diff line change
@@ -190,67 +190,62 @@ def from_config(cls, config):
190190

191191
### Cerebros model:
192192

193+
from transformers import AutoTokenizer
194+
import tensorflow as tf
195+
193196
class NewTokenizerLayer(tf.keras.layers.Layer):
    """Keras layer that tokenizes batched string tensors with a Hugging Face tokenizer.

    Wraps an ``AutoTokenizer`` in ``tf.py_function`` so it can run inside a
    Keras / ``tf.data`` pipeline. Emits padded-or-truncated ``input_ids`` of
    shape ``(batch, max_seq_length)`` as ``tf.int32``.
    """

    def __init__(self, max_seq_length, tokenizer_checkpoint, **kwargs):
        """Initialize the layer.

        Args:
            max_seq_length (int): Fixed output sequence length; every example
                is padded or truncated to exactly this many tokens.
            tokenizer_checkpoint (str): Hugging Face model id or local path
                passed to ``AutoTokenizer.from_pretrained``.
            **kwargs: Standard Keras layer keyword arguments (name, dtype, ...).
        """
        super().__init__(**kwargs)
        self.max_seq_length = max_seq_length
        # Store the raw checkpoint string so get_config() can round-trip the
        # layer without relying on tokenizer internals.
        self.tokenizer_checkpoint = tokenizer_checkpoint
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

        # GPT-2-style tokenizers ship without a pad token; reuse EOS so that
        # padding='max_length' in call() does not fail.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

    def call(self, inputs):
        """Tokenize a batch of UTF-8 string tensors into input ids.

        Args:
            inputs: ``tf.string`` tensor, presumably of shape ``(batch,)`` —
                TODO(review): confirm callers never pass higher-rank input.

        Returns:
            ``tf.int32`` tensor of shape ``(batch, max_seq_length)``.
        """
        def tokenize_py_fn(batch):
            # Eager-side helper: decode TF byte strings, then run the HF
            # tokenizer with fixed-length padding/truncation.
            texts = [text.decode('utf-8') for text in batch.numpy()]
            tokenized = self.tokenizer(
                texts,
                max_length=self.max_seq_length,
                padding='max_length',
                truncation=True,
                return_tensors='tf'
            )
            return tokenized['input_ids'].numpy()

        # Wrap the Python-only tokenizer in a TF op so it can run in graph mode.
        input_ids = tf.py_function(
            tokenize_py_fn,
            [inputs],
            Tout=tf.int32
        )

        # py_function erases static shape information; restore it so
        # downstream layers can build. (Removed an unused batch_size local
        # that the previous version computed and never read.)
        input_ids.set_shape([None, self.max_seq_length])

        return input_ids

    def get_config(self):
        """Return a JSON-serializable config enabling model save/load."""
        config = super().get_config()
        config.update({
            'max_seq_length': self.max_seq_length,
            'tokenizer_checkpoint': self.tokenizer_checkpoint
        })
        return config

    @classmethod
    def from_config(cls, config):
        """Rebuild the layer from a config produced by ``get_config``."""
        return cls(
            max_seq_length=config['max_seq_length'],
            tokenizer_checkpoint=config['tokenizer_checkpoint']
        )
254249

255250

256251

0 commit comments

Comments
 (0)