Skip to content

format code #219

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jul 7, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .style.yapf
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
[style]
based_on_style = chromium
based_on_style = yapf
11 changes: 6 additions & 5 deletions delta/data/datasets/atis.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""
## References

Expand Down Expand Up @@ -81,10 +80,12 @@ def download(self) -> bool:

def after_download(self) -> bool:
try:
summary_joint_nlu_data(os.path.join(self.download_dir, "atis.train.pkl"),
os.path.join(self.data_dir, self.train_file))
summary_joint_nlu_data(os.path.join(self.download_dir, "atis.test.pkl"),
os.path.join(self.data_dir, self.test_file))
summary_joint_nlu_data(
os.path.join(self.download_dir, "atis.train.pkl"),
os.path.join(self.data_dir, self.train_file))
summary_joint_nlu_data(
os.path.join(self.download_dir, "atis.test.pkl"),
os.path.join(self.data_dir, self.test_file))
except Exception as e:

logging.warning(traceback.format_exc())
Expand Down
38 changes: 21 additions & 17 deletions delta/data/datasets/atis2.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,9 @@ def __init__(self, project_dir):
self.train_download = "origin_data/atis-2.train.w-intent.iob"
self.dev_download = "origin_data/atis-2.dev.w-intent.iob"
self.test_download = "origin_data/atis.test.w-intent.iob"
self.download_files = [self.train_download, self.dev_download, self.test_download]
self.download_files = [
self.train_download, self.dev_download, self.test_download
]
self.config_files = ['atis2_nlu_joint_lstm_crf.yml']

@staticmethod
Expand All @@ -57,28 +59,30 @@ def to_standard_format(input_file, output_file):
text = text.rstrip("EOS")
text = text.strip()

out_file.write(intent_label + "\t"
+ slots_label + "\t"
+ text + "\n")
out_file.write(intent_label + "\t" + slots_label + "\t" + text + "\n")

def download(self) -> bool:
github_url = "https://github.com/yvchen/JointSLU.git"
res = os.system(f'cd {self.download_dir}; git clone {github_url}')
if res != 0:
return False
return True
github_url = "https://github.com/yvchen/JointSLU.git"
res = os.system(f'cd {self.download_dir}; git clone {github_url}')
if res != 0:
return False
return True

def after_download(self) -> bool:
try:
shutil.move(os.path.join(self.download_dir, "JointSLU/data"),
os.path.join(self.download_dir, "origin_data"))
shutil.move(
os.path.join(self.download_dir, "JointSLU/data"),
os.path.join(self.download_dir, "origin_data"))
shutil.rmtree(os.path.join(self.download_dir, "JointSLU"))
self.to_standard_format(os.path.join(self.download_dir, self.train_download),
os.path.join(self.data_dir, self.train_file))
self.to_standard_format(os.path.join(self.download_dir, self.dev_download),
os.path.join(self.data_dir, self.dev_file))
self.to_standard_format(os.path.join(self.download_dir, self.test_download),
os.path.join(self.data_dir, self.test_file))
self.to_standard_format(
os.path.join(self.download_dir, self.train_download),
os.path.join(self.data_dir, self.train_file))
self.to_standard_format(
os.path.join(self.download_dir, self.dev_download),
os.path.join(self.data_dir, self.dev_file))
self.to_standard_format(
os.path.join(self.download_dir, self.test_download),
os.path.join(self.data_dir, self.test_file))
except Exception as e:
logging.warning(traceback.format_exc())
return False
Expand Down
1 change: 0 additions & 1 deletion delta/data/datasets/base_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Data set operation class"""

import os
Expand Down
59 changes: 31 additions & 28 deletions delta/data/datasets/conll_2003.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""
## References

Expand Down Expand Up @@ -70,9 +69,10 @@ def __init__(self, project_dir):
self.dev_file = "dev.txt"
self.test_file = "test.txt"
self.data_files = [self.train_file, self.test_file, self.dev_file]
self.config_files = ["conll_2003_seq_label_bert.yml",
"conll_2003_seq_label_elmo.yml",
"conll_2003_seq_label_lstm_crf.yml"]
self.config_files = [
"conll_2003_seq_label_bert.yml", "conll_2003_seq_label_elmo.yml",
"conll_2003_seq_label_lstm_crf.yml"
]
self.download_files = [self.train_file, self.test_file, self.dev_file]

def download(self) -> bool:
Expand All @@ -91,35 +91,38 @@ def download(self) -> bool:
@staticmethod
def to_standard_format(input_file, output_file):

logging.info("Change data format: {}".format(input_file))
words, labels = [], []
with open(output_file, "w", encoding="utf-8") as output_file:
with open(input_file, "r", encoding="utf-8") as file_input:
for line in file_input.readlines():
word = line.strip().split(' ')[0]
label = line.strip().split(' ')[-1]
          # here we don't do the "DOCSTART" check
if len(line.strip()) == 0:
l = [label for label in labels if not label]
w = [word for word in words if not word]
assert len(l) == len(w)
l, w = ' '.join(l), ' '.join(w)
output_file.write(l + "\t" + w + "\n")
words, labels = [], []
words.append(word)
labels.append(label)
logging.info("Change data done: {}".format(output_file))
logging.info("Change data format: {}".format(input_file))
words, labels = [], []
with open(output_file, "w", encoding="utf-8") as output_file:
with open(input_file, "r", encoding="utf-8") as file_input:
for line in file_input.readlines():
word = line.strip().split(' ')[0]
label = line.strip().split(' ')[-1]
          # here we don't do the "DOCSTART" check
if len(line.strip()) == 0:
l = [label for label in labels if not label]
w = [word for word in words if not word]
assert len(l) == len(w)
l, w = ' '.join(l), ' '.join(w)
output_file.write(l + "\t" + w + "\n")
words, labels = [], []
words.append(word)
labels.append(label)
logging.info("Change data done: {}".format(output_file))

def after_download(self) -> bool:
try:
download_file = os.path.join(self.download_dir, "yahoo_answers_csv.tgz")
os.system(f"tar zxvf {download_file} -C {self.download_dir}")
self.to_standard_format(os.path.join(self.download_dir, self.train_file),
os.path.join(self.data_dir, self.train_file))
self.to_standard_format(os.path.join(self.download_dir, self.dev_file),
os.path.join(self.data_dir, self.dev_file))
self.to_standard_format(os.path.join(self.download_dir, self.test_file),
os.path.join(self.data_dir, self.test_file))
self.to_standard_format(
os.path.join(self.download_dir, self.train_file),
os.path.join(self.data_dir, self.train_file))
self.to_standard_format(
os.path.join(self.download_dir, self.dev_file),
os.path.join(self.data_dir, self.dev_file))
self.to_standard_format(
os.path.join(self.download_dir, self.test_file),
os.path.join(self.data_dir, self.test_file))
except Exception as e:
logging.warning(traceback.format_exc())
return False
Expand Down
88 changes: 52 additions & 36 deletions delta/data/datasets/mock_text_cls_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""
## Data scale introduction

Expand All @@ -40,61 +39,78 @@ def __init__(self, project_dir):
samples_split_by_space = ["1\t都 挺好", "0\t我 很 愤怒"]
samples_split_by_char = ["1\t都挺好", "0\t我很愤怒"]
samples_chinese_word = ["1\t都挺好", "0\t我很愤怒"]
self.samples_dict = {"english": samples_english,
"split_by_line_mark": samples_split_line_mark,
"split_by_space": samples_split_by_space,
"split_by_char": samples_split_by_char,
"chinese_word": samples_chinese_word}
self.samples_dict = {
"english": samples_english,
"split_by_line_mark": samples_split_line_mark,
"split_by_space": samples_split_by_space,
"split_by_char": samples_split_by_char,
"chinese_word": samples_chinese_word
}

self.train_file = "train.txt"
self.dev_file = "dev.txt"
self.test_file = "test.txt"
self.text_vocab = "text_vocab.txt"
files = [self.train_file, self.dev_file, self.test_file]
self.data_files = [x.replace("txt", "")+data_type +".txt"
for x in files for data_type in self.samples_dict]
self.data_files = [
x.replace("txt", "") + data_type + ".txt"
for x in files
for data_type in self.samples_dict
]
self.config_files = ['cnn_cls_mock.yml']
self.download_files = []

text_vocab_english = ["<unk>\t0", "</s>\t1", "all\t3", "is\t4",
"well\t5", "i\t6", "am\t7", "very\t8"]
text_vocab_split_line_mark = ["<unk>\t0", "</s>\t1", "都\t2", "挺好\t3",
"我\t4", "很\t5", "|\t6", "是的\t7",
"呀\t8", "超级\t9", "生气\t10"]
text_vocab_split_by_space = ["<unk>\t0", "</s>\t1", "都\t2", "挺好\t3",
"我\t4", "很\t5"]
text_vocab_split_by_char = ["<unk>\t0", "</s>\t1", "都\t2", "挺\t3",
"好\t4", "我\t5", "很\t6", "愤\t7", "怒\t8"]
text_vocab_chinese_word = ["<unk>\t0", "</s>\t1", "都\t2", "挺好\t3",
"我\t4", "很\t5"]
self.text_vocab_dict = {"english": text_vocab_english,
"split_by_line_mark": text_vocab_split_line_mark,
"split_by_space": text_vocab_split_by_space,
"split_by_char": text_vocab_split_by_char,
"chinese_word": text_vocab_chinese_word}

text_vocab_english = [
"<unk>\t0", "</s>\t1", "all\t3", "is\t4", "well\t5", "i\t6", "am\t7",
"very\t8"
]
text_vocab_split_line_mark = [
"<unk>\t0", "</s>\t1", "都\t2", "挺好\t3", "我\t4", "很\t5", "|\t6", "是的\t7",
"呀\t8", "超级\t9", "生气\t10"
]
text_vocab_split_by_space = [
"<unk>\t0", "</s>\t1", "都\t2", "挺好\t3", "我\t4", "很\t5"
]
text_vocab_split_by_char = [
"<unk>\t0", "</s>\t1", "都\t2", "挺\t3", "好\t4", "我\t5", "很\t6", "愤\t7",
"怒\t8"
]
text_vocab_chinese_word = [
"<unk>\t0", "</s>\t1", "都\t2", "挺好\t3", "我\t4", "很\t5"
]
self.text_vocab_dict = {
"english": text_vocab_english,
"split_by_line_mark": text_vocab_split_line_mark,
"split_by_space": text_vocab_split_by_space,
"split_by_char": text_vocab_split_by_char,
"chinese_word": text_vocab_chinese_word
}

def download(self) -> bool:
return True


def after_download(self) -> bool:
try:
for data_type in self.samples_dict:

samples = self.samples_dict[data_type]
text_vocab_list = self.text_vocab_dict[data_type]

train_file_path = os.path.join(self.data_dir,
self.train_file.replace("txt", "") + data_type + ".txt")
dev_file_path = os.path.join(self.data_dir,
self.dev_file.replace("txt", "") + data_type + ".txt")
test_file_path = os.path.join(self.data_dir,
self.test_file.replace("txt", "") + data_type + ".txt")
text_vocab_file = os.path.join(self.data_dir,
self.text_vocab.replace("txt", "") + data_type + ".txt")

mock_data(samples, train_file_path, dev_file_path, test_file_path, text_vocab_file, text_vocab_list)
train_file_path = os.path.join(
self.data_dir,
self.train_file.replace("txt", "") + data_type + ".txt")
dev_file_path = os.path.join(
self.data_dir,
self.dev_file.replace("txt", "") + data_type + ".txt")
test_file_path = os.path.join(
self.data_dir,
self.test_file.replace("txt", "") + data_type + ".txt")
text_vocab_file = os.path.join(
self.data_dir,
self.text_vocab.replace("txt", "") + data_type + ".txt")

mock_data(samples, train_file_path, dev_file_path, test_file_path,
text_vocab_file, text_vocab_list)

except Exception as e:
logging.warning(traceback.format_exc())
Expand Down
1 change: 0 additions & 1 deletion delta/data/datasets/mock_text_cls_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
from delta.data.datasets.mock_text_cls_data import MockTextCLSData



class MockTextClsDataTest(tf.test.TestCase):
"""mock cls data class for cls task."""

Expand Down
19 changes: 10 additions & 9 deletions delta/data/datasets/mock_text_match_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""
## Data scale introduction

Expand All @@ -27,6 +26,7 @@
from delta.data.datasets.utils import mock_data
from delta.utils.register import registers


@registers.dataset.register('mock_text_match_data')
class MockTextMatchData(BaseDataSet):
"""mock match data class for match task."""
Expand All @@ -42,18 +42,19 @@ def __init__(self, project_dir):
self.text_vocab = "text_vocab.txt"

# samples with label
self.samples = ["0\tHow should I approach forgiveness?\tI got chickenpox as a child.",
"1\tI love china。\tI love china very much。"]
self.text_vocab_list = ["<unk>\t0", "</s>\t1", "how\t2", "should\t3",
"i\t4", "approach\t5", "forgiveness\t6", "got\t7",
"chickenpox\t8", "as\t9", "a\t10",
"child\t11", "love\t12", "china\t13",
"very\t14", "much\t15"]
self.samples = [
"0\tHow should I approach forgiveness?\tI got chickenpox as a child.",
"1\tI love china。\tI love china very much。"
]
self.text_vocab_list = [
"<unk>\t0", "</s>\t1", "how\t2", "should\t3", "i\t4", "approach\t5",
"forgiveness\t6", "got\t7", "chickenpox\t8", "as\t9", "a\t10",
"child\t11", "love\t12", "china\t13", "very\t14", "much\t15"
]

def download(self) -> bool:
return True


def after_download(self) -> bool:
try:
train_file_path = os.path.join(self.data_dir, self.train_file)
Expand Down
1 change: 0 additions & 1 deletion delta/data/datasets/mock_text_match_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
from delta.data.datasets.mock_text_match_data import MockTextMatchData



class MockTextMatchDataTest(tf.test.TestCase):
"""mock data class test for match task."""

Expand Down
Loading