--- tools/sortlm.py.orig Mon Jul 29 09:53:28 2013 +++ tools/sortlm.py Thu Sep 23 10:02:37 2021 @@ -1,7 +1,7 @@ #!/usr/bin/python -# Copyright (C) 2011-2013 Daiki Ueno -# Copyright (C) 2011-2013 Red Hat, Inc. +# Copyright (C) 2011-2014 Daiki Ueno +# Copyright (C) 2011-2014 Red Hat, Inc. # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -40,10 +40,10 @@ class SortedGenerator(object): self.__min_cost = 0.0 def read(self): - print "reading N-grams" + print("reading N-grams") self.__read_tries() self.__read_ngrams() - print "min cost = %lf" % self.__min_cost + print("min cost = %lf" % self.__min_cost) def __read_tries(self): while True: @@ -58,7 +58,7 @@ class SortedGenerator(object): line = self.__infile.readline() if line == "": break - line = line.strip() + line = line.strip('\n') if line == "": break match = self.__ngram_line_regex.match(line) @@ -89,7 +89,7 @@ class SortedGenerator(object): line = self.__infile.readline() if line == "": break - line = line.strip() + line = line.strip('\n') if line == "": break match = self.__ngram_line_regex.match(line) @@ -125,14 +125,11 @@ class SortedGenerator(object): def quantize(cost, min_cost): return max(0, min(65535, int(cost * 65535 / min_cost))) - def cmp_header(a, b): - return cmp(a[0], b[0]) - - print "writing 1-gram file" + print("writing 1-gram file") unigram_offsets = {} unigram_file = open("%s.1gram" % self.__output_prefix, "wb") offset = 0 - for ids, value in sorted(self.__ngram_entries[0].iteritems()): + for ids, value in sorted(self.__ngram_entries[0].items()): unigram_offsets[ids[0]] = offset s = struct.pack("=HHH", quantize(value[0], self.__min_cost), @@ -143,13 +140,13 @@ class SortedGenerator(object): offset += 1 unigram_file.close() - print "writing 2-gram file" + print("writing 2-gram file") bigram_offsets = {} bigram_file = open("%s.2gram" % self.__output_prefix, "wb") keys = self.__ngram_entries[1].keys() items = [(struct.pack("=LL", ids[1], unigram_offsets[ids[0]]), ids) for ids in keys] offset = 0 - for header, ids in sorted(items, cmp=cmp_header): + for header, ids in sorted(items, key=lambda x: x[0]): value = self.__ngram_entries[1][ids] bigram_offsets[ids] = offset s = struct.pack("=HH", @@ -160,11 +157,11 @@ class SortedGenerator(object): bigram_file.close() if len(self.__ngram_entries[2]) > 0: - print "writing 3-gram file" + print("writing 3-gram file") trigram_file = open("%s.3gram" % self.__output_prefix, "wb") keys = self.__ngram_entries[2].keys() items = [(struct.pack("=LL", ids[2], bigram_offsets[(ids[0], ids[1])]), ids) for ids in keys] - for header, ids in sorted(items, cmp=cmp_header): + for header, ids in sorted(items, key=lambda x: x[0]): value = self.__ngram_entries[2][ids] s = struct.pack("=H", quantize(value[0], self.__min_cost))