prashanth-chandran
diff --git a/‎OpenNMT/LICENSE.md‎
Lines changed: 19 additions & 0 deletions b/‎OpenNMT/LICENSE.md‎
Lines changed: 19 additions & 0 deletions
diff --git a/‎OpenNMT/README.md‎
Lines changed: 23 additions & 0 deletions b/‎OpenNMT/README.md‎
Lines changed: 23 additions & 0 deletions
diff --git a/‎OpenNMT/onmt/Constants.py‎
Lines changed: 10 additions & 0 deletions b/‎OpenNMT/onmt/Constants.py‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎OpenNMT/onmt/Dataset.py‎
Lines changed: 45 additions & 0 deletions b/‎OpenNMT/onmt/Dataset.py‎
Lines changed: 45 additions & 0 deletions
diff --git a/‎OpenNMT/onmt/Dict.py‎
Lines changed: 127 additions & 0 deletions b/‎OpenNMT/onmt/Dict.py‎
Lines changed: 127 additions & 0 deletions
@@ -0,0 +1,19 @@
+The MIT License (MIT)
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
@@ -0,0 +1,23 @@
+# OpenNMT: Open-Source Neural Machine Translation
+
+This is a [PyTorch](https://github.com/pytorch/pytorch)
+port of [OpenNMT](https://github.com/OpenNMT/OpenNMT),
+an open-source (MIT) neural machine translation system.
+
+<center style="padding: 40px"><img width="70%" src="http://opennmt.github.io/simple-attn.png" /></center>
+
+## Quickstart
+
+OpenNMT consists of three commands:
+
+1) Preprocess the data.
+
+```python preprocess.py -train_src data/src-train.txt -train_tgt data/tgt-train.txt -valid_src data/src-val.txt -valid_tgt data/tgt-val.txt -save_data data/demo```
+
+2) Train the model.
+
+```python train.py -data data/demo-train.pt -save_model model -cuda```
+
+3) Translate sentences.
+
+TODO
@@ -0,0 +1,10 @@
+
# Reserved vocabulary indices: padding, unknown word, beginning of
# sentence and end of sentence, in that order.
PAD, UNK, BOS, EOS = 0, 1, 2, 3

# Surface forms of the reserved vocabulary entries, in the same order.
PAD_WORD, UNK_WORD = '<blank>', '<unk>'
BOS_WORD, EOS_WORD = '<s>', '</s>'
@@ -0,0 +1,45 @@
+import onmt
+from torch.autograd import Variable
+
+
class Dataset(object):
    """Serve parallel source/target sentences as padded batches.

    Sentences are grouped in order into consecutive batches of
    `batchSize`. The number of batches is ``len(src) // batchSize``, so
    trailing sentences that do not fill a whole batch are never served.
    """

    # FIXME: randomize
    def __init__(self, srcData, tgtData, batchSize, cuda):
        """Store the data and compute the number of batches.

        Args:
            srcData: mapping whose 'words' entry is the sequence of
                source sentences.
            tgtData: mapping whose 'words' entry is the sequence of
                target sentences; must have as many entries as the source.
            batchSize: number of sentences per batch.
            cuda: when true, batches are moved with `.cuda()` before
                being returned.
        """
        self.src = srcData['words']
        self.tgt = tgtData['words']
        self.cuda = cuda
        # FIXME: the `features` entries of srcData / tgtData are not
        # handled yet.
        assert(len(self.src) == len(self.tgt))
        self.batchSize = batchSize
        self.numBatches = len(self.src) // batchSize

    def _batchify(self, data, align_right=False):
        """Stack 1-D tensors into one PAD-filled 2-D tensor.

        Args:
            data: non-empty sequence of 1-D tensors.
            align_right: when true, each row is right-aligned so the
                padding comes first; otherwise padding follows the data.

        Returns:
            A `Variable` wrapping a tensor of size
            (len(data), longest sequence length).
        """
        max_length = max(x.size(0) for x in data)
        out = data[0].new(len(data), max_length).fill_(onmt.Constants.PAD)
        for row, seq in zip(out, data):
            data_length = seq.size(0)
            offset = max_length - data_length if align_right else 0
            row.narrow(0, offset, data_length).copy_(seq)
        return Variable(out)

    def __getitem__(self, index):
        """Return the (source, target) batch at position `index`.

        Negative indices count from the last batch. The source batch is
        right-aligned and the target batch is left-aligned; both are
        transposed so the first dimension runs over sentence positions.

        Raises:
            IndexError: if `index` is out of range. Raising IndexError
                (rather than asserting) is what lets
                ``for batch in dataset`` terminate normally.
        """
        position = index + self.numBatches if index < 0 else index
        if not 0 <= position < self.numBatches:
            raise IndexError(
                "batch index %d out of range for %d batches"
                % (index, self.numBatches))

        start = position * self.batchSize
        stop = start + self.batchSize
        srcBatch = self._batchify(self.src[start:stop], align_right=True)
        tgtBatch = self._batchify(self.tgt[start:stop])

        if self.cuda:
            srcBatch = srcBatch.cuda()
            tgtBatch = tgtBatch.cuda()

        # FIXME
        # Transpose from (batch, length) to (length, batch).
        srcBatch = srcBatch.t().contiguous()
        tgtBatch = tgtBatch.t().contiguous()

        return srcBatch, tgtBatch

    def __len__(self):
        """Return the number of full batches."""
        return self.numBatches
@@ -0,0 +1,127 @@
+import torch
+
+
class Dict(object):
    """Two-way mapping between labels and integer indices.

    Each call to `add` also counts the index in `frequencies`, which
    `prune` uses to keep only the most frequent entries. Indices listed
    in `special` are always carried over when pruning.
    """

    def __init__(self, data=None):
        """Create a dictionary, optionally pre-populated.

        Args:
            data: if a string, the path of a file to read with
                `loadFile`; otherwise an iterable of labels that are
                registered as special entries. `None` leaves the
                dictionary empty.
        """
        self.idxToLabel = {}
        self.labelToIdx = {}
        self.frequencies = {}

        # Special entries will not be pruned.
        self.special = []

        if data is not None:
            if isinstance(data, str):
                self.loadFile(data)
            else:
                self.addSpecials(data)

    def size(self):
        """Return the number of entries."""
        return len(self.idxToLabel)

    def loadFile(self, filename):
        """Load entries from a file of whitespace-separated `label idx` lines.

        Blank lines are skipped.
        """
        # `with` guarantees the handle is closed even if parsing fails.
        with open(filename) as handle:
            for line in handle:
                fields = line.split()
                if not fields:
                    continue
                self.add(fields[0], int(fields[1]))

    def writeFile(self, filename):
        """Write entries to a file, one `label idx` pair per line.

        Entries are written for indices 0 .. size() - 1.
        """
        with open(filename, 'w') as handle:
            for i in range(self.size()):
                handle.write('%s %d\n' % (self.idxToLabel[i], i))

    def lookup(self, key, default=None):
        """Return the index of label `key`, or `default` if absent."""
        return self.labelToIdx.get(key, default)

    def getLabel(self, idx, default=None):
        """Return the label stored at `idx`, or `default` if absent."""
        return self.idxToLabel.get(idx, default)

    def addSpecial(self, label, idx=None):
        """Add `label` and mark its index as special (never pruned)."""
        idx = self.add(label, idx)
        self.special.append(idx)

    def addSpecials(self, labels):
        """Mark every label in `labels` as special (never pruned)."""
        for label in labels:
            self.addSpecial(label)

    def add(self, label, idx=None):
        """Add `label` to the dictionary and count one occurrence.

        Args:
            label: the entry to add.
            idx: index to store the label at. When omitted, an existing
                label keeps its index and a new label receives the next
                free one (the current number of entries).

        Returns:
            The index associated with `label`.
        """
        if idx is not None:
            self.idxToLabel[idx] = label
            self.labelToIdx[label] = idx
        elif label in self.labelToIdx:
            idx = self.labelToIdx[label]
        else:
            idx = len(self.idxToLabel)
            self.idxToLabel[idx] = label
            self.labelToIdx[label] = idx

        self.frequencies[idx] = self.frequencies.get(idx, 0) + 1
        return idx

    def prune(self, size):
        """Return a dictionary restricted to the `size` most frequent entries.

        Special entries are always kept, in addition to the `size` most
        frequent ones. If the dictionary already has at most `size`
        entries, `self` is returned unchanged. Frequencies in the new
        dictionary start afresh; they are not copied over.
        """
        if size >= self.size():
            return self

        # Only keep the `size` most frequent entries.
        freq = torch.Tensor(
            [self.frequencies[i] for i in range(len(self.frequencies))])
        _, order = torch.sort(freq, 0, True)

        newDict = Dict()

        # Add special entries in all cases.
        for i in self.special:
            newDict.addSpecial(self.idxToLabel[i])

        # Convert to Python ints: tensor elements are not usable as keys
        # of `idxToLabel`.
        for i in order[:size].tolist():
            newDict.add(self.idxToLabel[i])

        return newDict

    def convertToIdx(self, labels, unkWord, bosWord=None, eosWord=None):
        """Convert `labels` to a LongTensor of indices.

        Labels that are not found map to the index of `unkWord`.
        Optionally insert `bosWord` at the beginning and `eosWord` at the
        end.
        """
        vec = []

        if bosWord is not None:
            vec.append(self.lookup(bosWord))

        unk = self.lookup(unkWord)
        vec.extend(self.lookup(label, default=unk) for label in labels)

        if eosWord is not None:
            vec.append(self.lookup(eosWord))

        return torch.LongTensor(vec)

    def convertToLabels(self, idx, stop):
        """Convert the indices in `idx` to labels.

        Conversion ends after the first index equal to `stop`, which is
        itself converted and included. Indices without a label yield
        `None`.
        """
        labels = []

        for i in idx:
            # Iterating a tensor yields tensor elements; look up by the
            # plain integer value instead.
            i = int(i)
            labels.append(self.getLabel(i))
            if i == stop:
                break

        return labels