Module maleo.preprocessing
Expand source code Browse git
from ._preprocessing import *
# Names exported by ``from maleo.preprocessing import *``.
__all__ = ["word_to_number", "get_hashtag", "get_price", "email_to_tag", "date_to_tag",
"phone_to_tag", "slang_to_formal", "emoji_to_word", "emoji_to_tag", "custom_regex"]
Functions
def custom_regex(text: pandas.core.series.Series, pattern: str, val: str) ‑> pandas.core.series.Series
-
Replace every match of a custom regex pattern in the text with the given value.
Parameters
text
:pd.Series
- Series of text data
Returns
prepro_text
:pd.Series
- Result text from custom regex
Expand source code Browse git
def custom_regex(text: pd.Series, pattern: str, val: str) -> pd.Series:
    """Replace matches of a caller-supplied regex pattern in every entry.

    Parameters
    ----------
    text : pd.Series
        Series of text data
    pattern : str
        Regular expression passed to ``Series.replace`` as ``regex``
    val : str
        Replacement passed to ``Series.replace`` as ``value``

    Returns
    -------
    pd.Series
        Result text from custom regex
    """
    return text.replace(value=val, regex=pattern)
def date_to_tag(text: pandas.core.series.Series) ‑> pandas.core.series.Series
-
Convert date to <DATE> tag.
Parameters
text
:pd.Series
- Series of text data
Returns
prepro_text
:pd.Series
- Text with each date replaced by the <DATE> tag
Expand source code Browse git
def date_to_tag(text: pd.Series) -> pd.Series:
    """Replace slash-separated dates with the ``<DATE>`` tag.

    Two layouts are matched: day/month/four-digit-year and
    four-digit-year/month/day, with two-digit day and month.

    Parameters
    ----------
    text : pd.Series
        Series of text data

    Returns
    -------
    pd.Series
        Text in which every matched date is replaced by ``<DATE>``
    """
    day_first = r'\b(3[01]|[12][0-9]|0[1-9])/(1[0-2]|0[1-9])/([0-9]{4})\b'
    year_first = r'\b([0-9]{4})/(1[0-2]|0[1-9])/(3[01]|[12][0-9]|0[1-9])\b'
    # Either layout counts as a date, so both are combined into one alternation.
    date_pattern = re.compile(day_first + '|' + year_first)
    return text.replace(regex=date_pattern, value='<DATE>')
def email_to_tag(text: pandas.core.series.Series) ‑> pandas.core.series.Series
-
Convert email to <EMAIL> tag.
Parameters
text
:pd.Series
- Series of text data
Returns
prepro_text
:pd.Series
- Text with each email address replaced by the <EMAIL> tag
Expand source code Browse git
def email_to_tag(text: pd.Series) -> pd.Series:
    """Replace email addresses with the ``<EMAIL>`` tag.

    Parameters
    ----------
    text : pd.Series
        Series of text data

    Returns
    -------
    pd.Series
        Text in which every matched email address is replaced by ``<EMAIL>``
    """
    email_pattern = r'[\w\.-]+@[\w\.-]+\.\w+'
    return text.replace(regex=email_pattern, value='<EMAIL>')
def emoji_to_tag(text: pandas.core.series.Series) ‑> pandas.core.series.Series
-
Convert emoji to <EMOJI> tag.
Parameters
text
:pd.Series
- Series of text data
Returns
text
:pd.Series
- Text with emoji encoded in tag format
Expand source code Browse git
def emoji_to_tag(text: pd.Series) -> pd.Series:
    """Replace every known emoji with the ``<EMOJI>`` tag.

    Parameters
    ----------
    text : pd.Series
        Series of text data

    Returns
    -------
    pd.Series
        Text with emoji encoded in tag format (the tag is followed by a space)
    """
    tag = "<EMOJI> "
    tagged = text
    # Each key from load_emojis() is used verbatim (unescaped) inside a regex group.
    for emot in load_emojis():
        tagged = tagged.replace(regex=r'(' + emot + ')', value=tag)
    return tagged
def emoji_to_word(text: pandas.core.series.Series) ‑> pandas.core.series.Series
-
Convert emoji to natural language format.
Parameters
text
:pd.Series
- Series of text data
Returns
text
:pd.Series
- Text with emoji in natural language format
Expand source code Browse git
def emoji_to_word(text: pd.Series) -> pd.Series:
    """Replace every known emoji with its description in words.

    The description is stripped of commas and colons, its words are joined
    with underscores, and a trailing space is appended.

    Parameters
    ----------
    text : pd.Series
        Series of text data

    Returns
    -------
    pd.Series
        Text with emoji in natural language format
    """
    emoji_names = load_emojis()
    worded = text
    # Each key from load_emojis() is used verbatim (unescaped) inside a regex group.
    for emot in emoji_names:
        cleaned = emoji_names[emot].replace(",", "").replace(":", "")
        label = "_".join(cleaned.split()) + " "
        worded = worded.replace(regex=r'(' + emot + ')', value=label)
    return worded
def get_hashtag(text: pandas.core.series.Series) ‑> pandas.core.frame.DataFrame
-
Extract hashtag from text.
Parameters
text
:pd.Series
- Series of text data
Returns
prepro_text
:pd.DataFrame
- DataFrame with 2 columns (Text, Hashtag)
Expand source code Browse git
def get_hashtag(text: pd.Series) -> pd.DataFrame:
    """Extract hashtags from text.

    A hashtag is any whitespace-separated token that starts with ``#`` and
    has at least one character after it; the leading ``#`` is dropped.
    Only entries containing at least one hashtag appear in the result.

    Parameters
    ----------
    text : pd.Series
        Series of text data. Non-string entries (e.g. None/NaN) are skipped.

    Returns
    -------
    prepro_text : pd.DataFrame
        DataFrame with 2 columns (Text, Hashtag). ``Hashtag`` holds a list of
        the unique hashtags of the entry, in order of first appearance.
    """
    list_text, list_hashtag = [], []
    for value in text:
        # Missing values have no ``split``; skip them instead of raising.
        if not isinstance(value, str):
            continue
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # output is deterministic (a set would give an arbitrary order).
        hashtags = list(dict.fromkeys(
            token[1:] for token in value.split()
            # len > 1 rejects a bare '#', which would give an empty hashtag.
            if token.startswith('#') and len(token) > 1
        ))
        if hashtags:
            list_text.append(value)
            list_hashtag.append(hashtags)
    prepro_text = pd.DataFrame({'Text': list_text, 'Hashtag': list_hashtag})
    return prepro_text
def get_price(text: pandas.core.series.Series) ‑> pandas.core.frame.DataFrame
-
Extract price from text. Supported currencies: ['Rp', 'RP', '$'].
Parameters
text
:pd.Series
- Series of text data
Returns
prepro_text
:pd.DataFrame
- DataFrame with 2 columns (Text, Price)
Expand source code Browse git
def get_price(text: pd.Series) -> pd.DataFrame:
    """Extract prices from text for the currencies ['Rp', 'RP', '$'].

    Every entry is parsed with ``Price.fromstring``; an entry is kept only
    when the parsed currency is one of the supported symbols.

    Parameters
    ----------
    text : pd.Series
        Series of text data

    Returns
    -------
    pd.DataFrame
        DataFrame with 2 columns (Text, Price)
    """
    supported = ('Rp', 'RP', '$')
    parsed = ((value, Price.fromstring(value)) for _, value in text.items())
    kept = [(value, price) for value, price in parsed if price.currency in supported]
    return pd.DataFrame({
        'Text': [value for value, _ in kept],
        'Price': [price for _, price in kept],
    })
def phone_to_tag(text: pandas.core.series.Series) ‑> pandas.core.series.Series
-
Convert phone number to <PHONE> tag.
Parameters
text
:pd.Series
- Series of text data
Returns
prepro_text
:pd.Series
- Text with each phone number replaced by the <PHONE> tag
Expand source code Browse git
def phone_to_tag(text: pd.Series) -> pd.Series:
    """Replace phone numbers with the ``<PHONE>`` tag.

    A number must start with ``+62`` (optionally followed by one whitespace
    character) or ``0``, followed by three groups of 3-4 digits; the first
    two groups may each end with a hyphen.

    Parameters
    ----------
    text : pd.Series
        Series of text data

    Returns
    -------
    pd.Series
        Text in which every matched phone number is replaced by ``<PHONE>``
    """
    phone_pattern = r'(\+62\s?|0)(\d{3,4}-?){2}\d{3,4}'
    return text.replace(regex=phone_pattern, value='<PHONE>')
def slang_to_formal(text: pandas.core.series.Series) ‑> pandas.core.series.Series
-
Convert slang or colloquial word to formal word.
Parameters
text
:pd.Series
- Series of text data
Returns
prepro_text
:pd.Series
- Text with formal word
Expand source code Browse git
def slang_to_formal(text: pd.Series) -> pd.Series:
    """Convert slang or colloquial words to formal words.

    The slang-to-formal mapping is read from ``preprocessing/slang_dict.json``
    inside the ``maleo`` package and applied with a ``KeywordProcessor``.
    Afterwards runs of two or more whitespace characters are collapsed to a
    single space and leading/trailing whitespace is stripped.

    Parameters
    ----------
    text : pd.Series
        Series of text data

    Returns
    -------
    prepro_text : pd.Series
        Text with formal words
    """
    slang_dict_path = pkg_resources.resource_filename('maleo', 'preprocessing/slang_dict.json')
    # NOTE(review): read_json is assumed to return a slang -> formal mapping
    # exposing ``.items()`` -- confirm against its definition.
    dict_alay = read_json(slang_dict_path)

    keyword_proc = KeywordProcessor()
    for slang, formal in dict_alay.items():
        keyword_proc.add_keyword(slang, formal)

    prepro_text = text.apply(keyword_proc.replace_keywords)
    # The pattern must be passed as ``regex=``: a positional argument is
    # compared literally against whole cells and never matches. Replace with
    # a single space so neighbouring words are not glued together.
    prepro_text = prepro_text.replace(regex=r"\s{2,}", value=" ")
    prepro_text = prepro_text.str.strip()
    return prepro_text
def word_to_number(text: pandas.core.series.Series, lang='en') ‑> pandas.core.series.Series
-
Convert numbers written in natural language to their equivalent numeric forms in text.
It currently supports cardinal numbers in the following languages - English(en), Hindi(hi), Spanish(es), Russian(ru) and ordinal numbers in English.
SUPPORTED_LANGUAGES = ['en', 'es', 'hi', 'ru']
Parameters
text
:pd.Series
- Series of text data
lang
:str
- Language, Default = 'en'
Returns
prepro_text
:pd.Series
- Text with numbers written in numeric forms
Expand source code Browse git
def word_to_number(text: pd.Series, lang='en') -> pd.Series:
    """Rewrite numbers spelled out in words as digits.

    Each entry is passed to ``parse`` together with the chosen language.
    It currently supports cardinal numbers in the following languages -
    English(en), Hindi(hi), Spanish(es), Russian(ru) and ordinal numbers in
    English.

    SUPPORTED_LANGUAGES = ['en', 'es', 'hi', 'ru']

    Parameters
    ----------
    text : pd.Series
        Series of text data
    lang : str
        Language, Default = 'en'

    Returns
    -------
    pd.Series
        Text with numbers written in numeric forms
    """
    return text.apply(lambda row: parse(row, language=lang))