Skip to content

format code #219

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jul 7, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .style.yapf
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
[style]
based_on_style = chromium
based_on_style = yapf
11 changes: 6 additions & 5 deletions delta/data/datasets/atis.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""
## References

Expand Down Expand Up @@ -81,10 +80,12 @@ def download(self) -> bool:

def after_download(self) -> bool:
try:
summary_joint_nlu_data(os.path.join(self.download_dir, "atis.train.pkl"),
os.path.join(self.data_dir, self.train_file))
summary_joint_nlu_data(os.path.join(self.download_dir, "atis.test.pkl"),
os.path.join(self.data_dir, self.test_file))
summary_joint_nlu_data(
os.path.join(self.download_dir, "atis.train.pkl"),
os.path.join(self.data_dir, self.train_file))
summary_joint_nlu_data(
os.path.join(self.download_dir, "atis.test.pkl"),
os.path.join(self.data_dir, self.test_file))
except Exception as e:

logging.warning(traceback.format_exc())
Expand Down
38 changes: 21 additions & 17 deletions delta/data/datasets/atis2.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,9 @@ def __init__(self, project_dir):
self.train_download = "origin_data/atis-2.train.w-intent.iob"
self.dev_download = "origin_data/atis-2.dev.w-intent.iob"
self.test_download = "origin_data/atis.test.w-intent.iob"
self.download_files = [self.train_download, self.dev_download, self.test_download]
self.download_files = [
self.train_download, self.dev_download, self.test_download
]
self.config_files = ['atis2_nlu_joint_lstm_crf.yml']

@staticmethod
Expand All @@ -57,28 +59,30 @@ def to_standard_format(input_file, output_file):
text = text.rstrip("EOS")
text = text.strip()

out_file.write(intent_label + "\t"
+ slots_label + "\t"
+ text + "\n")
out_file.write(intent_label + "\t" + slots_label + "\t" + text + "\n")

def download(self) -> bool:
github_url = "https://github.com/yvchen/JointSLU.git"
res = os.system(f'cd {self.download_dir}; git clone {github_url}')
if res != 0:
return False
return True
github_url = "https://github.com/yvchen/JointSLU.git"
res = os.system(f'cd {self.download_dir}; git clone {github_url}')
if res != 0:
return False
return True

def after_download(self) -> bool:
try:
shutil.move(os.path.join(self.download_dir, "JointSLU/data"),
os.path.join(self.download_dir, "origin_data"))
shutil.move(
os.path.join(self.download_dir, "JointSLU/data"),
os.path.join(self.download_dir, "origin_data"))
shutil.rmtree(os.path.join(self.download_dir, "JointSLU"))
self.to_standard_format(os.path.join(self.download_dir, self.train_download),
os.path.join(self.data_dir, self.train_file))
self.to_standard_format(os.path.join(self.download_dir, self.dev_download),
os.path.join(self.data_dir, self.dev_file))
self.to_standard_format(os.path.join(self.download_dir, self.test_download),
os.path.join(self.data_dir, self.test_file))
self.to_standard_format(
os.path.join(self.download_dir, self.train_download),
os.path.join(self.data_dir, self.train_file))
self.to_standard_format(
os.path.join(self.download_dir, self.dev_download),
os.path.join(self.data_dir, self.dev_file))
self.to_standard_format(
os.path.join(self.download_dir, self.test_download),
os.path.join(self.data_dir, self.test_file))
except Exception as e:
logging.warning(traceback.format_exc())
return False
Expand Down
1 change: 0 additions & 1 deletion delta/data/datasets/base_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Data set operation class"""

import os
Expand Down
59 changes: 31 additions & 28 deletions delta/data/datasets/conll_2003.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""
## References

Expand Down Expand Up @@ -70,9 +69,10 @@ def __init__(self, project_dir):
self.dev_file = "dev.txt"
self.test_file = "test.txt"
self.data_files = [self.train_file, self.test_file, self.dev_file]
self.config_files = ["conll_2003_seq_label_bert.yml",
"conll_2003_seq_label_elmo.yml",
"conll_2003_seq_label_lstm_crf.yml"]
self.config_files = [
"conll_2003_seq_label_bert.yml", "conll_2003_seq_label_elmo.yml",
"conll_2003_seq_label_lstm_crf.yml"
]
self.download_files = [self.train_file, self.test_file, self.dev_file]

def download(self) -> bool:
Expand All @@ -91,35 +91,38 @@ def download(self) -> bool:
@staticmethod
def to_standard_format(input_file, output_file):

logging.info("Change data format: {}".format(input_file))
words, labels = [], []
with open(output_file, "w", encoding="utf-8") as output_file:
with open(input_file, "r", encoding="utf-8") as file_input:
for line in file_input.readlines():
word = line.strip().split(' ')[0]
label = line.strip().split(' ')[-1]
          # here we don't do the "DOCSTART" check
if len(line.strip()) == 0:
l = [label for label in labels if not label]
w = [word for word in words if not word]
assert len(l) == len(w)
l, w = ' '.join(l), ' '.join(w)
output_file.write(l + "\t" + w + "\n")
words, labels = [], []
words.append(word)
labels.append(label)
logging.info("Change data done: {}".format(output_file))
logging.info("Change data format: {}".format(input_file))
words, labels = [], []
with open(output_file, "w", encoding="utf-8") as output_file:
with open(input_file, "r", encoding="utf-8") as file_input:
for line in file_input.readlines():
word = line.strip().split(' ')[0]
label = line.strip().split(' ')[-1]
          # here we don't do the "DOCSTART" check
if len(line.strip()) == 0:
l = [label for label in labels if not label]
w = [word for word in words if not word]
assert len(l) == len(w)
l, w = ' '.join(l), ' '.join(w)
output_file.write(l + "\t" + w + "\n")
words, labels = [], []
words.append(word)
labels.append(label)
logging.info("Change data done: {}".format(output_file))

def after_download(self) -> bool:
try:
download_file = os.path.join(self.download_dir, "yahoo_answers_csv.tgz")
os.system(f"tar zxvf {download_file} -C {self.download_dir}")
self.to_standard_format(os.path.join(self.download_dir, self.train_file),
os.path.join(self.data_dir, self.train_file))
self.to_standard_format(os.path.join(self.download_dir, self.dev_file),
os.path.join(self.data_dir, self.dev_file))
self.to_standard_format(os.path.join(self.download_dir, self.test_file),
os.path.join(self.data_dir, self.test_file))
self.to_standard_format(
os.path.join(self.download_dir, self.train_file),
os.path.join(self.data_dir, self.train_file))
self.to_standard_format(
os.path.join(self.download_dir, self.dev_file),
os.path.join(self.data_dir, self.dev_file))
self.to_standard_format(
os.path.join(self.download_dir, self.test_file),
os.path.join(self.data_dir, self.test_file))
except Exception as e:
logging.warning(traceback.format_exc())
return False
Expand Down
88 changes: 52 additions & 36 deletions delta/data/datasets/mock_text_cls_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""
## Data scale introduction

Expand All @@ -40,61 +39,78 @@ def __init__(self, project_dir):
samples_split_by_space = ["1\t都 挺好", "0\t我 很 愤怒"]
samples_split_by_char = ["1\t都挺好", "0\t我很愤怒"]
samples_chinese_word = ["1\t都挺好", "0\t我很愤怒"]
self.samples_dict = {"english": samples_english,
"split_by_line_mark": samples_split_line_mark,
"split_by_space": samples_split_by_space,
"split_by_char": samples_split_by_char,
"chinese_word": samples_chinese_word}
self.samples_dict = {
"english": samples_english,
"split_by_line_mark": samples_split_line_mark,
"split_by_space": samples_split_by_space,
"split_by_char": samples_split_by_char,
"chinese_word": samples_chinese_word
}

self.train_file = "train.txt"
self.dev_file = "dev.txt"
self.test_file = "test.txt"
self.text_vocab = "text_vocab.txt"
files = [self.train_file, self.dev_file, self.test_file]
self.data_files = [x.replace("txt", "")+data_type +".txt"
for x in files for data_type in self.samples_dict]
self.data_files = [
x.replace("txt", "") + data_type + ".txt"
for x in files
for data_type in self.samples_dict
]
self.config_files = ['cnn_cls_mock.yml']
self.download_files = []

text_vocab_english = ["<unk>\t0", "</s>\t1", "all\t3", "is\t4",
"well\t5", "i\t6", "am\t7", "very\t8"]
text_vocab_split_line_mark = ["<unk>\t0", "</s>\t1", "都\t2", "挺好\t3",
"我\t4", "很\t5", "|\t6", "是的\t7",
"呀\t8", "超级\t9", "生气\t10"]
text_vocab_split_by_space = ["<unk>\t0", "</s>\t1", "都\t2", "挺好\t3",
"我\t4", "很\t5"]
text_vocab_split_by_char = ["<unk>\t0", "</s>\t1", "都\t2", "挺\t3",
"好\t4", "我\t5", "很\t6", "愤\t7", "怒\t8"]
text_vocab_chinese_word = ["<unk>\t0", "</s>\t1", "都\t2", "挺好\t3",
"我\t4", "很\t5"]
self.text_vocab_dict = {"english": text_vocab_english,
"split_by_line_mark": text_vocab_split_line_mark,
"split_by_space": text_vocab_split_by_space,
"split_by_char": text_vocab_split_by_char,
"chinese_word": text_vocab_chinese_word}

text_vocab_english = [
"<unk>\t0", "</s>\t1", "all\t3", "is\t4", "well\t5", "i\t6", "am\t7",
"very\t8"
]
text_vocab_split_line_mark = [
"<unk>\t0", "</s>\t1", "都\t2", "挺好\t3", "我\t4", "很\t5", "|\t6", "是的\t7",
"呀\t8", "超级\t9", "生气\t10"
]
text_vocab_split_by_space = [
"<unk>\t0", "</s>\t1", "都\t2", "挺好\t3", "我\t4", "很\t5"
]
text_vocab_split_by_char = [
"<unk>\t0", "</s>\t1", "都\t2", "挺\t3", "好\t4", "我\t5", "很\t6", "愤\t7",
"怒\t8"
]
text_vocab_chinese_word = [
"<unk>\t0", "</s>\t1", "都\t2", "挺好\t3", "我\t4", "很\t5"
]
self.text_vocab_dict = {
"english": text_vocab_english,
"split_by_line_mark": text_vocab_split_line_mark,
"split_by_space": text_vocab_split_by_space,
"split_by_char": text_vocab_split_by_char,
"chinese_word": text_vocab_chinese_word
}

def download(self) -> bool:
return True


def after_download(self) -> bool:
try:
for data_type in self.samples_dict:

samples = self.samples_dict[data_type]
text_vocab_list = self.text_vocab_dict[data_type]

train_file_path = os.path.join(self.data_dir,
self.train_file.replace("txt", "") + data_type + ".txt")
dev_file_path = os.path.join(self.data_dir,
self.dev_file.replace("txt", "") + data_type + ".txt")
test_file_path = os.path.join(self.data_dir,
self.test_file.replace("txt", "") + data_type + ".txt")
text_vocab_file = os.path.join(self.data_dir,
self.text_vocab.replace("txt", "") + data_type + ".txt")

mock_data(samples, train_file_path, dev_file_path, test_file_path, text_vocab_file, text_vocab_list)
train_file_path = os.path.join(
self.data_dir,
self.train_file.replace("txt", "") + data_type + ".txt")
dev_file_path = os.path.join(
self.data_dir,
self.dev_file.replace("txt", "") + data_type + ".txt")
test_file_path = os.path.join(
self.data_dir,
self.test_file.replace("txt", "") + data_type + ".txt")
text_vocab_file = os.path.join(
self.data_dir,
self.text_vocab.replace("txt", "") + data_type + ".txt")

mock_data(samples, train_file_path, dev_file_path, test_file_path,
text_vocab_file, text_vocab_list)

except Exception as e:
logging.warning(traceback.format_exc())
Expand Down
1 change: 0 additions & 1 deletion delta/data/datasets/mock_text_cls_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
from delta.data.datasets.mock_text_cls_data import MockTextCLSData



class MockTextClsDataTest(tf.test.TestCase):
"""mock cls data class for cls task."""

Expand Down
19 changes: 10 additions & 9 deletions delta/data/datasets/mock_text_match_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""
## Data scale introduction

Expand All @@ -27,6 +26,7 @@
from delta.data.datasets.utils import mock_data
from delta.utils.register import registers


@registers.dataset.register('mock_text_match_data')
class MockTextMatchData(BaseDataSet):
"""mock match data class for match task."""
Expand All @@ -42,18 +42,19 @@ def __init__(self, project_dir):
self.text_vocab = "text_vocab.txt"

# samples with label
self.samples = ["0\tHow should I approach forgiveness?\tI got chickenpox as a child.",
"1\tI love china。\tI love china very much。"]
self.text_vocab_list = ["<unk>\t0", "</s>\t1", "how\t2", "should\t3",
"i\t4", "approach\t5", "forgiveness\t6", "got\t7",
"chickenpox\t8", "as\t9", "a\t10",
"child\t11", "love\t12", "china\t13",
"very\t14", "much\t15"]
self.samples = [
"0\tHow should I approach forgiveness?\tI got chickenpox as a child.",
"1\tI love china。\tI love china very much。"
]
self.text_vocab_list = [
"<unk>\t0", "</s>\t1", "how\t2", "should\t3", "i\t4", "approach\t5",
"forgiveness\t6", "got\t7", "chickenpox\t8", "as\t9", "a\t10",
"child\t11", "love\t12", "china\t13", "very\t14", "much\t15"
]

def download(self) -> bool:
return True


def after_download(self) -> bool:
try:
train_file_path = os.path.join(self.data_dir, self.train_file)
Expand Down
1 change: 0 additions & 1 deletion delta/data/datasets/mock_text_match_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
from delta.data.datasets.mock_text_match_data import MockTextMatchData



class MockTextMatchDataTest(tf.test.TestCase):
"""mock data class test for match task."""

Expand Down
Loading