Skip to content

Commit c0aa5bd

Browse files
增加Mr, R8, R52, Ohsumed, 20ng数据集的loader ,pipe (#382)
* 增加Mr, R8, R52, Ohsumed, 20ng数据集的loader ,pipe * "更改_20ng为NG20"
1 parent a5d608c commit c0aa5bd

File tree

20 files changed

+347
-6
lines changed

20 files changed

+347
-6
lines changed

fastNLP/io/loader/__init__.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,8 @@
5858
"ChnSentiCorpLoader",
5959
"THUCNewsLoader",
6060
"WeiboSenti100kLoader",
61+
"MRLoader",
62+
"R8Loader", "R52Loader", "OhsumedLoader", "NG20Loader",
6163

6264
'ConllLoader',
6365
'Conll2003Loader',
@@ -88,7 +90,8 @@
8890
]
8991
from .classification import CLSBaseLoader, YelpFullLoader, YelpPolarityLoader, AGsNewsLoader, IMDBLoader, \
9092
SSTLoader, SST2Loader, DBPediaLoader, \
91-
ChnSentiCorpLoader, THUCNewsLoader, WeiboSenti100kLoader
93+
ChnSentiCorpLoader, THUCNewsLoader, WeiboSenti100kLoader,\
94+
MRLoader, R8Loader, R52Loader, OhsumedLoader, NG20Loader
9295
from .conll import ConllLoader, Conll2003Loader, Conll2003NERLoader, OntoNotesNERLoader, CTBLoader
9396
from .conll import MsraNERLoader, PeopleDailyNERLoader, WeiboNERLoader
9497
from .coreference import CoReferenceLoader

fastNLP/io/loader/classification.py

Lines changed: 127 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,13 @@
1111
"SST2Loader",
1212
"ChnSentiCorpLoader",
1313
"THUCNewsLoader",
14-
"WeiboSenti100kLoader"
14+
"WeiboSenti100kLoader",
15+
16+
"MRLoader",
17+
"R8Loader",
18+
"R52Loader",
19+
"OhsumedLoader",
20+
"NG20Loader",
1521
]
1622

1723

@@ -512,3 +518,123 @@ def download(self) -> str:
512518
"""
513519
output_dir = self._get_dataset_path('weibo-senti-100k')
514520
return output_dir
521+
522+
class MRLoader(CLSBaseLoader):
    r"""Loader for the dataset registered under the name ``mr``."""

    def __init__(self):
        super().__init__()

    def download(self, dev_ratio: float = 0.0, re_download: bool = False) -> str:
        r"""
        Automatically download the dataset.

        If ``dev_ratio`` is not 0, part of the train data is randomly taken out and
        used as dev data, according to the value of ``dev_ratio``. After the download
        the output directory then contains train.csv, test.csv and dev.csv; otherwise
        it only contains train.csv and test.csv. The split itself is delegated to
        ``_split_dev``.

        :param float dev_ratio: if there is no dev set in the path, the fraction of
            train to split off as dev data. If 0, no dev set is split off.
        :param bool re_download: whether to download the data again in order to
            re-split it.
        :return: str, the directory path of the dataset
        """
        name = 'mr'
        # Resolve the dataset directory first, then let _split_dev decide
        # whether a dev split has to be (re)created inside it.
        cached_dir = self._get_dataset_path(dataset_name=name)
        return _split_dev(dataset_name=name,
                          data_dir=cached_dir,
                          dev_ratio=dev_ratio,
                          re_download=re_download,
                          suffix='csv')
545+
546+
class R8Loader(CLSBaseLoader):
    r"""Loader for the dataset registered under the name ``R8``."""

    def __init__(self):
        super().__init__()

    def download(self, dev_ratio: float = 0.0, re_download: bool = False) -> str:
        r"""
        Automatically download the dataset.

        If ``dev_ratio`` is not 0, part of the train data is randomly taken out and
        used as dev data, according to the value of ``dev_ratio``. After the download
        the output directory then contains train.csv, test.csv and dev.csv; otherwise
        it only contains train.csv and test.csv. The split itself is delegated to
        ``_split_dev``.

        :param float dev_ratio: if there is no dev set in the path, the fraction of
            train to split off as dev data. If 0, no dev set is split off.
        :param bool re_download: whether to download the data again in order to
            re-split it.
        :return: str, the directory path of the dataset
        """
        name = 'R8'
        # Resolve the dataset directory first, then let _split_dev decide
        # whether a dev split has to be (re)created inside it.
        cached_dir = self._get_dataset_path(dataset_name=name)
        return _split_dev(dataset_name=name,
                          data_dir=cached_dir,
                          dev_ratio=dev_ratio,
                          re_download=re_download,
                          suffix='csv')
569+
570+
class R52Loader(CLSBaseLoader):
    r"""Loader for the dataset registered under the name ``R52``."""

    def __init__(self):
        super().__init__()

    def download(self, dev_ratio: float = 0.0, re_download: bool = False) -> str:
        r"""
        Automatically download the dataset.

        If ``dev_ratio`` is not 0, part of the train data is randomly taken out and
        used as dev data, according to the value of ``dev_ratio``. After the download
        the output directory then contains train.csv, test.csv and dev.csv; otherwise
        it only contains train.csv and test.csv. The split itself is delegated to
        ``_split_dev``.

        :param float dev_ratio: if there is no dev set in the path, the fraction of
            train to split off as dev data. If 0, no dev set is split off.
        :param bool re_download: whether to download the data again in order to
            re-split it.
        :return: str, the directory path of the dataset
        """
        name = 'R52'
        # Resolve the dataset directory first, then let _split_dev decide
        # whether a dev split has to be (re)created inside it.
        cached_dir = self._get_dataset_path(dataset_name=name)
        return _split_dev(dataset_name=name,
                          data_dir=cached_dir,
                          dev_ratio=dev_ratio,
                          re_download=re_download,
                          suffix='csv')
593+
594+
class NG20Loader(CLSBaseLoader):
    r"""Loader for the dataset registered under the name ``20ng``."""

    def __init__(self):
        super().__init__()

    def download(self, dev_ratio: float = 0.0, re_download: bool = False) -> str:
        r"""
        Automatically download the dataset.

        If ``dev_ratio`` is not 0, part of the train data is randomly taken out and
        used as dev data, according to the value of ``dev_ratio``. After the download
        the output directory then contains train.csv, test.csv and dev.csv; otherwise
        it only contains train.csv and test.csv. The split itself is delegated to
        ``_split_dev``.

        :param float dev_ratio: if there is no dev set in the path, the fraction of
            train to split off as dev data. If 0, no dev set is split off.
        :param bool re_download: whether to download the data again in order to
            re-split it.
        :return: str, the directory path of the dataset
        """
        # The class is called NG20 because a Python identifier cannot start
        # with a digit; the dataset key itself stays '20ng'.
        name = '20ng'
        cached_dir = self._get_dataset_path(dataset_name=name)
        return _split_dev(dataset_name=name,
                          data_dir=cached_dir,
                          dev_ratio=dev_ratio,
                          re_download=re_download,
                          suffix='csv')
617+
618+
class OhsumedLoader(CLSBaseLoader):
    r"""Loader for the dataset registered under the name ``ohsumed``."""

    def __init__(self):
        super().__init__()

    def download(self, dev_ratio: float = 0.0, re_download: bool = False) -> str:
        r"""
        Automatically download the dataset.

        If ``dev_ratio`` is not 0, part of the train data is randomly taken out and
        used as dev data, according to the value of ``dev_ratio``. After the download
        the output directory then contains train.csv, test.csv and dev.csv; otherwise
        it only contains train.csv and test.csv. The split itself is delegated to
        ``_split_dev``.

        :param float dev_ratio: if there is no dev set in the path, the fraction of
            train to split off as dev data. If 0, no dev set is split off.
        :param bool re_download: whether to download the data again in order to
            re-split it.
        :return: str, the directory path of the dataset
        """
        name = 'ohsumed'
        # Resolve the dataset directory first, then let _split_dev decide
        # whether a dev split has to be (re)created inside it.
        cached_dir = self._get_dataset_path(dataset_name=name)
        return _split_dev(dataset_name=name,
                          data_dir=cached_dir,
                          dev_ratio=dev_ratio,
                          re_download=re_download,
                          suffix='csv')

fastNLP/io/pipe/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
"ChnSentiCorpPipe",
2424
"THUCNewsPipe",
2525
"WeiboSenti100kPipe",
26+
"MRPipe", "R52Pipe", "R8Pipe", "OhsumedPipe", "NG20Pipe",
2627

2728
"Conll2003NERPipe",
2829
"OntoNotesNERPipe",
@@ -59,7 +60,7 @@
5960
]
6061

6162
from .classification import CLSBasePipe, YelpFullPipe, YelpPolarityPipe, SSTPipe, SST2Pipe, IMDBPipe, ChnSentiCorpPipe, THUCNewsPipe, \
62-
WeiboSenti100kPipe, AGsNewsPipe, DBPediaPipe
63+
WeiboSenti100kPipe, AGsNewsPipe, DBPediaPipe, MRPipe, R8Pipe, R52Pipe, OhsumedPipe, NG20Pipe
6364
from .conll import Conll2003NERPipe, OntoNotesNERPipe, MsraNERPipe, WeiboNERPipe, PeopleDailyPipe
6465
from .conll import Conll2003Pipe
6566
from .coreference import CoReferencePipe

fastNLP/io/pipe/classification.py

Lines changed: 117 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,8 @@
1111
'IMDBPipe',
1212
"ChnSentiCorpPipe",
1313
"THUCNewsPipe",
14-
"WeiboSenti100kPipe"
14+
"WeiboSenti100kPipe",
15+
"MRPipe", "R8Pipe", "R52Pipe", "OhsumedPipe", "NG20Pipe"
1516
]
1617

1718
import re
@@ -28,7 +29,7 @@
2829
from ..data_bundle import DataBundle
2930
from ..loader.classification import ChnSentiCorpLoader, THUCNewsLoader, WeiboSenti100kLoader
3031
from ..loader.classification import IMDBLoader, YelpFullLoader, SSTLoader, SST2Loader, YelpPolarityLoader, \
31-
AGsNewsLoader, DBPediaLoader
32+
AGsNewsLoader, DBPediaLoader, MRLoader, R52Loader, R8Loader, OhsumedLoader, NG20Loader
3233
from ...core._logger import logger
3334
from ...core.const import Const
3435
from ...core.dataset import DataSet
@@ -827,3 +828,117 @@ def process_from_file(self, paths=None):
827828
data_bundle = data_loader.load(paths)
828829
data_bundle = self.process(data_bundle)
829830
return data_bundle
831+
832+
class MRPipe(CLSBasePipe):
    r"""Pipe that reads data with :class:`MRLoader` and passes it through ``self.process``."""

    def __init__(self, lower: bool = False, tokenizer: str = 'spacy'):
        r"""

        :param bool lower: whether to lowercase the data in the words column.
        :param str tokenizer: which tokenizer to use for splitting sentences into words.
            Supports ``spacy`` and ``raw``; ``raw`` splits on whitespace.
        """
        super().__init__(tokenizer=tokenizer, lang='en')
        # Stored after the base initialiser has run, so this value is the one kept.
        self.lower = lower

    def process_from_file(self, paths=None):
        r"""

        :param paths: for the supported path types see the ``load`` function of
            :class:`fastNLP.io.loader.Loader`.
        :return: DataBundle
        """
        # Read the data, then run the processing step on the loaded bundle.
        loaded = MRLoader().load(paths)
        return self.process(loaded)
853+
854+
855+
class R8Pipe(CLSBasePipe):
    r"""Pipe that reads data with :class:`R8Loader` and passes it through ``self.process``."""

    def __init__(self, lower: bool = False, tokenizer: str = 'spacy'):
        r"""

        :param bool lower: whether to lowercase the data in the words column.
        :param str tokenizer: which tokenizer to use for splitting sentences into words.
            Supports ``spacy`` and ``raw``; ``raw`` splits on whitespace.
        """
        super().__init__(tokenizer=tokenizer, lang='en')
        # Stored after the base initialiser has run, so this value is the one kept.
        self.lower = lower

    def process_from_file(self, paths=None):
        r"""

        :param paths: for the supported path types see the ``load`` function of
            :class:`fastNLP.io.loader.Loader`.
        :return: DataBundle
        """
        # Read the data, then run the processing step on the loaded bundle.
        loaded = R8Loader().load(paths)
        return self.process(loaded)
876+
877+
878+
class R52Pipe(CLSBasePipe):
    r"""Pipe that reads data with :class:`R52Loader` and passes it through ``self.process``."""

    def __init__(self, lower: bool = False, tokenizer: str = 'spacy'):
        r"""

        :param bool lower: whether to lowercase the data in the words column.
        :param str tokenizer: which tokenizer to use for splitting sentences into words.
            Supports ``spacy`` and ``raw``; ``raw`` splits on whitespace.
        """
        super().__init__(tokenizer=tokenizer, lang='en')
        # Stored after the base initialiser has run, so this value is the one kept.
        self.lower = lower

    def process_from_file(self, paths=None):
        r"""

        :param paths: for the supported path types see the ``load`` function of
            :class:`fastNLP.io.loader.Loader`.
        :return: DataBundle
        """
        # Read the data, then run the processing step on the loaded bundle.
        loaded = R52Loader().load(paths)
        return self.process(loaded)
899+
900+
901+
class OhsumedPipe(CLSBasePipe):
    r"""Pipe that reads data with :class:`OhsumedLoader` and passes it through ``self.process``."""

    def __init__(self, lower: bool = False, tokenizer: str = 'spacy'):
        r"""

        :param bool lower: whether to lowercase the data in the words column.
        :param str tokenizer: which tokenizer to use for splitting sentences into words.
            Supports ``spacy`` and ``raw``; ``raw`` splits on whitespace.
        """
        super().__init__(tokenizer=tokenizer, lang='en')
        # Stored after the base initialiser has run, so this value is the one kept.
        self.lower = lower

    def process_from_file(self, paths=None):
        r"""

        :param paths: for the supported path types see the ``load`` function of
            :class:`fastNLP.io.loader.Loader`.
        :return: DataBundle
        """
        # Read the data, then run the processing step on the loaded bundle.
        loaded = OhsumedLoader().load(paths)
        return self.process(loaded)
922+
923+
924+
class NG20Pipe(CLSBasePipe):
    r"""Pipe that reads data with :class:`NG20Loader` and passes it through ``self.process``."""

    def __init__(self, lower: bool = False, tokenizer: str = 'spacy'):
        r"""

        :param bool lower: whether to lowercase the data in the words column.
        :param str tokenizer: which tokenizer to use for splitting sentences into words.
            Supports ``spacy`` and ``raw``; ``raw`` splits on whitespace.
        """
        super().__init__(tokenizer=tokenizer, lang='en')
        # Stored after the base initialiser has run, so this value is the one kept.
        self.lower = lower

    def process_from_file(self, paths=None):
        r"""

        :param paths: for the supported path types see the ``load`` function of
            :class:`fastNLP.io.loader.Loader`.
        :return: DataBundle
        """
        # Read the data, then run the processing step on the loaded bundle.
        loaded = NG20Loader().load(paths)
        return self.process(loaded)
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
talk.religion.misc,"sandvik newton apple com \( kent sandvik \) subject clarification organization cookamunga tourist bureau lines 14 sorry , san jose based rosicrucian order called r c , n't remember time stand r c ordo rosae crucis , words latin order rose cross sigh , seems loosing long term memory otherwise headquarters san jose pretty decent metaphysical bookstore , interested books son loves run around egyptian museum cheers , kent sandvik newton apple com alink ksand private activities net"
2+
talk.religion.misc,"subject catholic lit nunnally acs harding edu \( john nunnally \) distribution world organization harding university , , ar nntp posting host acs harding edu x news reader vms news 1 reply dlphknob camelot bradley edu 's message 16 apr 93 18 57 20 gmtlines 45 lines 45 dlphknob camelot dlphknob camelot bradley edu writes 1993apr14 476 mtechca maintech com foster mtechca maintech com writes surprised saddened would expect kind behavior evangelical born gospel thumping face 're true christian protestants , always thought catholics behaved better please stoop level e b g f w c protestants , think best way witness strident , intrusive , loud , insulting self righteous \( pleading mode \) please ! i'm begging ! quit confusing religious groups , stop making generalizations ! i'm protestant ! i'm evangelical ! n't believe way way ! i'm creation scientist ! n't think homosexuals hung ! want discuss bible thumpers , would better singling \( making generalizations \) fundamentalists compared actions methodists southern baptists , would think different religions ! sarcasm sure pick correct groups bible thumpers , fundamentalists , southern baptists deserve hasty generalizations prejudicial statements n't pick methodists ! sarcasm please , prejudice thinking people group , please n't write protestants evangelicals ! \( pleading mode \) god wish could get ahold thomas stories n , n tha gb , gb n yvan sasha david cole iv chief research dlphknob camelot bradley edu"
3+
talk.religion.misc,"sandvik newton apple com \( kent sandvik \) subject alt sex stories literary critical analysis \) organization cookamunga tourist bureau lines 16 article h7v agate berkeley edu , dzkriz ocf berkeley edu \( dennis kriz \) wrote i'm going try something , perhaps many would thought even possible want begin process initiating literary critical study pornography posted alt sex stories , identify major themes motifs present stories posted opening possibility objective moral evaluation material present dennis , i'm astounded n't know interested even study filth alt sex stories provide cheers , kent sandvik newton apple com alink ksand private activities net"
4+
talk.religion.misc,"anthony landreneau ozonehole com \( anthony landreneau \) subject abortion distribution world organization ozone online operations , inc , dba ozone hole bbs reply anthony landreneau ozonehole com \( anthony landreneau \) lines 21 margoli watson ibm com \( larry margolis \) anthony landreneau ozonehole com lm rape passed , nothing ever take away lm true forcing remain pregnant continues violation lm body another 9 months see unbelievably cruel life violation cruel , killing living solely friend right cold anthony slmr 2 1 's difference orange \? ozone hole bbs private bulletin board service \( 504 \) 891 3142 3 full service nodes usrobotics 16 8k bps 10 gigs 100 , 000 files skydive new orleans ! rime network mail hub 500 usenet newsgroups please route questions inquiries postmaster ozonehole com"
5+
talk.religion.misc,"kevin rotag mi org \( kevin darcy \) subject 2000 years , say christian morality organization , \? \? \? lines 15 article pww spac at1 59 rice edu pww spacsun rice edu \( peter walker \) writes article 1993apr18 rotag mi org , kevin rotag mi org \( kevin darcy \) wrote , one , considered intentionality primary ontological stuff built perceptions , consciousness , thoughts , etc frank means alone seeing intentionality \( values , puts \) underlying human experience , even called objective experiences , measurements natural world , output des chip others us see intellectual masturbation 'll defer greater firsthand knowledge matters kevin"
6+
talk.religion.misc,"bil okcforum osrhe edu \( bill conner \) subject 2000 years , say christian morality nntp posting host okcforum osrhe edu organization okcforum unix users group x newsreader tin version 1 1 pl9 lines 54 mind , say science basis values bit reach science basis observable fact 'd say one chooses observe observation interpreted significance 's given depends great deal values observer science human activity , subject potential distortion human activity myth scientists moral influence ethical concern , knowledge whole pure nature biases scientist , nonsense bill one argue objective values \( moral sense \) one must first start demonstrating morality objective considering meaning word objective doubt ever happen , back original question objective morality \? may unfortunate choice words , almost self contradictory objective sense used means something immutable absolute morality describes behavior group people first term inclusive , second specific concept supposedly described may meaning however god described christians \( instance \) , existence apart independent humankind existence outside frame reference \( reality \) declares thing , necessarily since defined omnipotent , claims believed , least omnipotent relative us god intrinsically self defined reality whatever says objective sense god determines standard conduct , standard objective human beings held accountable conformance standard permitted ignore , substitute relative morality mode conduct , giving term morality nebulous , meaningless sense argued pretending misunderstand standard objective conduct required meet standard therefore objectively determined convenient pretend term morality infinitely , n't mean objective standard n't exist morality come mean little cultural norm , preferred conduct decent people , making seem subjective , derived absolute , objective , standard ironically , objective standard perfect accord true nature \( according christianity least \) , yet condemned contrary 
human , oppressive severe may due bill much amoral standard , like , 's x"

0 commit comments

Comments
 (0)