PACKAGES
import re
import pykakasi
import pinyin
import chinese
from hanziconv import HanziConv
from .spiderman import *
from .utils import *
from .sheets import *
from translate import Translator
import jaconv
from hangul_romanize import Transliter
from hangul_romanize.rule import academic
import ast
import random
import pandas as pd
from tqdm import tqdm

# Module-level converters used by the helpers below. This setup is my
# assumption of the missing glue code: pykakasi's legacy API supplies
# conv.do(), with the space separator enabled so the ' ha ' -> ' wa '
# particle swap in tryRomaji() has something to match, and
# hangul_romanize's Transliter supplies transliter.translit().
kakasi = pykakasi.kakasi()
kakasi.setMode('H', 'a')   # hiragana -> romaji
kakasi.setMode('K', 'a')   # katakana -> romaji
kakasi.setMode('J', 'a')   # kanji -> romaji
kakasi.setMode('s', True)  # separate words with spaces
conv = kakasi.getConverter()
transliter = Transliter(academic)
CHINESE FUNCTIONS
Here are the functions I've created to help me learn Mandarin:
def tryPinyin(x):
PURPOSE
Wraps pinyin.get() so that any error simply returns the input unchanged.
try: return pinyin.get(x)
except: return x
def tryTraditional(x):
PURPOSE
Wraps HanziConv.toTraditional() so that any error simply returns the input unchanged.
try: return HanziConv.toTraditional(x)
except: return x
def trySimplified(x):
PURPOSE
Wraps HanziConv.toSimplified() so that any error simply returns the input unchanged.
try: return HanziConv.toSimplified(x)
except: return x
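A quick sketch of how these wrappers behave (outputs are indicative; exact tone formatting depends on the installed pinyin package):
print(tryPinyin('你好'))        # e.g. 'nǐhǎo'
print(tryTraditional('学习'))   # '學習'
print(trySimplified('學習'))    # '学习'
print(tryPinyin(None))          # the error is swallowed and the input returned: None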
def tryChineseSentenceOld(x):
PURPOSE
My outdated way of searching for sentences.
You input a Chinese word, and it returns corpus sentences containing it.
I moved away from this because it was neither reliable nor versatile enough.
try:
baseUrl = 'http://bcc.blcu.edu.cn/zh/search/0/'
url = baseUrl + str(x.encode())[2:-1].replace(r'\x','%')
site = website(url)
site.getRelevant(keywords = [x])
toReturn = []
for text in site.relevantText[:5]:
try: toReturn.append(text.split('全文;')[1].replace(';',''))
except: pass
return '\n\n'.join(toReturn)
except: return ''
def chineseSentence(word, meanings = '', all_ = False):
PURPOSE
I use the chinesepod.com site to search for sentences based on Chinese words.
PARAMETERS
word [str]: self-explanatory
meanings [str]: Most of the time, each word or character will have multiple meanings.
However, when searching for a sample sentence, you usually want to focus on one particular meaning of the word.
This parameter takes care of that: as long as a sentence's English translation contains one of the meanings, it passes through the filter.
For now, the words within this parameter are split out and filtered by character length (3 to 15).
I could use NLP for this, but that feels like overkill; I don't need EVERY single sentence, just a couple.
all_ [boolean]: if True, all search results are returned
OUTPUT
A list containing the sentence, its pinyin equivalent, and its English meaning.
try:
site = website('https://chinesepod.com/dictionary/english-chinese/' + word)
site.getTables()
meanings = re.findall('[A-Za-z]{3,15}',meanings)
sentenceSets = site.tables[0][0].tolist()
sentenceSets = keywordFilter(sentenceSets,meanings)
if all_:
toReturn = []
for sentenceSet in sentenceSets:
toReturn.append([entry.strip() for entry in re.split('\n+',sentenceSet)[1:4]])
return toReturn
else:
chosenSentenceSet = random.choice(sentenceSets)
return [entry.strip() for entry in re.split('\n+',chosenSentenceSet)[1:4]]
except: return ['','','']
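For instance, here is how I typically call it; treat this as a sketch, since the results depend on whatever chinesepod.com serves at the time:
# One random matching sentence, as [sentence, pinyin, translation]:
sentence, sentence_pinyin, translation = chineseSentence('学习', meanings='to study; to learn')
# Or every matching sentence at once:
all_matches = chineseSentence('学习', meanings='to study; to learn', all_=True)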
def chineseFull(word, meanings = '', all_ = False):
PURPOSE
Consolidates the functions above to give a more comprehensive picture of the word.
It uses chineseSentence(), tryPinyin() and tryTraditional().
PARAMETERS
word [str]: self-explanatory
meanings [str]: Most of the time, each word or character will have multiple meanings.
However, when searching for a sample sentence, you usually want to focus on one particular meaning of the word.
This parameter takes care of that: as long as a sentence's English translation contains one of the meanings, it passes through the filter.
For now, the words within this parameter are split out and filtered by character length (3 to 15).
I could use NLP for this, but that feels like overkill; I don't need EVERY single sentence, just a couple.
all_ [boolean]: if True, all search results are returned
OUTPUT
A dictionary containing the word itself, its meanings, its pinyin form, its traditional form, example sentences, the pinyin form of said sentences, and the English translation of said sentences.
if all_:
Ls = []
results = chineseSentence(word, meanings, all_ = True)
for entry in results:
L = {'Word':word,'Usages':meanings,'Word Pinyin':tryPinyin(word),'Traditional':tryTraditional(word),'Examples':'','Example pinyin':'','Example translation':''}
L['Examples'], L['Example pinyin'], L['Example translation'] = entry
Ls.append(L)
return Ls
else:
L = {'Word':word,'Usages':meanings,'Word Pinyin':tryPinyin(word),'Traditional':tryTraditional(word),'Examples':'','Example pinyin':'','Example translation':''}
results = chineseSentence(word,meanings)
L['Examples'], L['Example pinyin'], L['Example translation'] = results
return L
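A sketch of how I feed the all_ output into a dataframe for flashcards (assuming pandas, as imported above):
rows = chineseFull('学习', meanings='to study; to learn', all_=True)
df = pd.DataFrame(rows)   # one row per example sentence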
def getHSKVocab(level):
PURPOSE
hsk.academy has compiled the vocabulary requirements for each of the HSK levels.
Each word has its own page, coupled with the occasional sample sentences.
This function extracts that information and compiles everything.
PARAMETERS
level [int, 1 to 6]: the HSK level of the vocabulary list you want
site = website('https://hsk.academy/en/hsk_%d' % level)
soup = site.html('script',string=re.compile('words'))[0]
words = re.findall('{"id".*?}',soup.string.replace('\\',''))
words = [ast.literal_eval(x) for x in words]
df = pd.DataFrame(words)[['hanziRaw','trad','pinyinToneSpace','def']]
df.columns = ['Word','Traditional','Word Pinyin','Usages']
df['Level'] = 'HSK %d' % level
return df
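For example (the exact rows depend on hsk.academy's current list):
hsk1 = getHSKVocab(1)
print(hsk1.columns.tolist())   # ['Word', 'Traditional', 'Word Pinyin', 'Usages', 'Level']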
def HSKSentence(word):
PURPOSE
As mentioned, each word in the HSK list has its own page, which sometimes includes sample sentences.
This function extracts those sentences.
PARAMETERS
word [str]: word in the HSK list which you want to search
try:
site = website('https://hsk.academy/en/words/%s' % word)
results = []
for span in site.html('span'):
if span.parent.name == 'li':
results.append(list(span.stripped_strings))
if results == []:
return [['','']]
return results
except: return [['','']]
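A sketch of the output shape (the actual sentences depend on what the word's page holds):
pairs = HSKSentence('爱')   # e.g. [['我爱你。', 'Wǒ ài nǐ.']]
# Words whose pages have no sample sentences come back as [['','']].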
def fullHSKProcess():
PURPOSE
Runs the whole pipeline: it fetches the HSK words and their sample sentences using the two functions above,
then compiles the results into a single dataframe with a standardised format.
hsk = pd.concat([getHSKVocab(level) for level in range(1,7)])
results = []
for word in tqdm(hsk['Word']):
results.append(HSKSentence(word))
hsk['results'] = results
hsk = hsk.explode('results')
hsk['Sentence'] = hsk['results'].apply(lambda x: x[0])
hsk['Sentence Pinyin'] = hsk['results'].apply(lambda x: x[1])
hsk = hsk.drop('results', axis=1)
return hsk.reset_index(drop=True)
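Typical usage; it scrapes one page per word, so expect it to take a while:
hsk = fullHSKProcess()
hsk.to_csv('hsk_vocab.csv', index=False)   # hypothetical output path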
JAPANESE FUNCTIONS
Here are the functions I've created to help me learn Japanese:
def tryRomaji(x):
PURPOSE
Wraps conv.do() (the pykakasi converter) so that any error simply returns the input unchanged; it converts Japanese text to romaji and rewrites standalone " ha " (the topic particle は) to " wa ".
try: return conv.do(x).replace(' ha ',' wa ')
except: return x
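A quick sketch, assuming conv was configured with the space separator as in the setup above:
print(tryRomaji('これはペンです'))   # e.g. 'kore wa pen desu' (standalone は becomes "wa")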
def japaneseSentence(word, meanings = '', all_ = False):
PURPOSE
I use the kanshudo.com site to search for sentences based on Japanese words.
PARAMETERS
word [str]: self-explanatory
meanings [str]: Most of the time, each word or character will have multiple meanings.
However, when searching for a sample sentence, you usually want to focus on one particular meaning of the word.
This parameter takes care of that: as long as a sentence's English translation contains one of the meanings, it passes through the filter.
For now, the words within this parameter are split out and filtered by character length (3 to 15).
I could use NLP for this, but that feels like overkill; I don't need EVERY single sentence, just a couple.
all_ [boolean]: if True, all search results are returned
OUTPUT
A list containing the sentence, its romaji equivalent, and its English meaning.
try:
site = website('https://www.kanshudo.com/searcht?q=' + word)
meanings = re.findall('[A-Za-z]{3,15}',meanings)
sentenceSets = site.html(class_='tatoeba')
sentenceSets = list(filter(lambda x:re.compile(regOR(meanings)).search(x.get_text()),sentenceSets))
if all_:
toReturn = []
for sentenceSet in sentenceSets:
for furi in sentenceSet(class_='furigana'):
furi.clear()
sentence = re.split('\n+', sentenceSet.get_text())[2]
transliteration = tryRomaji(sentence)
translation = re.split('\n+', sentenceSet.get_text())[3].strip()
toReturn.append([sentence, transliteration, translation])
return toReturn
else:
chosenSentenceSet = random.choice(sentenceSets)
for furi in chosenSentenceSet(class_='furigana'):
furi.clear()
sentence = re.split('\n+', chosenSentenceSet.get_text())[2]
transliteration = tryRomaji(sentence)
translation = re.split('\n+', chosenSentenceSet.get_text())[3].strip()
return [sentence, transliteration, translation]
except:
return ['','','']
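Typical usage, as a sketch (results depend on kanshudo.com's search output at the time):
sentence, romaji, translation = japaneseSentence('勉強', meanings='to study')
all_matches = japaneseSentence('勉強', meanings='to study', all_=True)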
def japaneseFull(word, meanings = "", all_ = False):
PURPOSE
Consolidates the functions above to give a more comprehensive picture of the word.
It uses japaneseSentence(), tryRomaji() and tryPinyin().
PARAMETERS
word [str]: self-explanatory
meanings [str]: Most of the time, each word or character will have multiple meanings.
However, when searching for a sample sentence, you usually want to focus on one particular meaning of the word.
This parameter takes care of that: as long as a sentence's English translation contains one of the meanings, it passes through the filter.
For now, the words within this parameter are split out and filtered by character length (3 to 15).
I could use NLP for this, but that feels like overkill; I don't need EVERY single sentence, just a couple.
all_ [boolean]: if True, all search results are returned
OUTPUT
A dictionary containing the word itself, its meanings, its romaji form, its pinyin form (for the kanji), example sentences, the romaji form of said sentences, and the English translation of said sentences.
if all_:
Ls = []
results = japaneseSentence(word, meanings, all_ = True)
for entry in results:
L = {'Word':word,'Usages':meanings,'Transliteration':tryRomaji(word),'Chinese':tryPinyin(word),'Examples':'','English':'','Translation':''}
L['Examples'], L['English'], L['Translation'] = entry
Ls.append(L)
return Ls
else:
L = {'Word':word,'Usages':meanings,'Transliteration':tryRomaji(word),'Chinese':tryPinyin(word),'Examples':'','English':'','Translation':''}
results = japaneseSentence(word,meanings)
L['Examples'], L['English'], L['Translation'] = results
return L
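As with chineseFull(), the all_ mode stacks neatly into a dataframe (a sketch, assuming pandas as imported above):
rows = japaneseFull('勉強', meanings='to study', all_=True)
df = pd.DataFrame(rows)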
def tryFindKanji(x, exact = False):
PURPOSE
Japanese text has a romaji form, i.e. its English transliteration.
These often overlap: multiple Japanese words can share the same romaji.
I wanted to find those overlaps, so I made this function, powered by jisho.org.
It takes a string such as "jisho" and returns a dictionary of Japanese words matching that romaji, each mapped to its furigana and meaning.
Setting exact = False gives the results more leeway, so words like "jishou" also come out.
try:
site = website('https://jisho.org/search/' + x)
results = {}
for i in site.html('span',class_='text')[1:]:
word = i.get_text().strip()
furigana = i.find_previous(class_='furigana').get_text().strip()
meaning = i.find_next(class_='meaning-meaning').get_text()
if furigana == x or not exact:
results[word] = {'furigana':furigana, 'meaning':meaning}
return results
except: return {}
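For example, a sketch (the matches depend on jisho.org's results):
matches = tryFindKanji('jisho', exact=True)
for word, info in matches.items():
    print(word, info['furigana'], info['meaning'])   # e.g. 辞書 じしょ dictionary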
def JLPTKanji():
PURPOSE
JLPTsensei has a very good list of kanji, ordered by frequency of use.
This function extracts the useful information yet again; I had to apply a lot of standardisation throughout.
links = []
i = 1
while i < 12:
try:
site = website("https://jlptsensei.com/kanji-list-ordered-by-frequency-of-use/page/{}/".format(i))
links.extend(list(set(a["href"] for a in site.html("a", href=re.compile("learn-japanese-kanji")))))
i += 1
except:
break
links = list(set(links))
result = []
for link in tqdm(links):
subsite = website(link)
try: kpro = allstr(subsite.html(class_="header-kunyomi")[0]," ")
except: kpro = ""
try: opro = allstr(subsite.html(class_="header-onyomi")[0]," ")
except: opro = ""
krea = []
try:
for word in list(subsite.html(text=re.compile("Kunyomi Readings"))[0].next_siblings):
try:
krea.append(tuple(map(lambda x: x.strip(), re.split("【|】",word))))
except: continue
except: pass
orea = []
try:
for word in list(subsite.html(text=re.compile("Onyomi Readings"))[0].next_siblings):
try:
orea.append(tuple(map(lambda x: x.strip(), re.split("【|】",word))))
except: continue
except: pass
sentences = []
for h in subsite.html("h5"):
try:
jap, *a, romaji, meaning = list(map(allstr, h.next_siblings))
sentences.append((jap, romaji, meaning))
except: continue
try: rank = int(allstr(subsite.html(text="2,500")[0].find_previous("b").find_previous("b")))
except: rank = 0
try: meaning = subsite.html(text=re.compile("Meaning: "))[0]
except: meaning = ""
try: kanji = subsite.html(text=re.compile("Meaning of"))[0].replace("Meaning of","").strip()
except: kanji = ""
for sentence in sentences:
try:
result.append({"Rank":rank,
"Kanji":kanji,
"Word":kanji,
"KPro":kpro,
"OPro":opro,
"Meaning":meaning,
"Sentence":sentence[0],
"Romaji":sentence[1],
"SMeaning":sentence[2]})
except: pass
for kre in krea:
try:
result.append({"Rank":rank,
"Kanji":kanji,
"Word":kre[0],
"KPro":kre[1],
"OPro":"",
"Meaning":kre[2],
"Sentence":"",
"Romaji":"",
"SMeaning":""})
except: pass
for ore in orea:
try:
result.append({"Rank":rank,
"Kanji":kanji,
"Word":ore[0],
"KPro":ore[1],
"OPro":"",
"Meaning":ore[2],
"Sentence":"",
"Romaji":"",
"SMeaning":""})
except: pass
return result
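A sketch of how I would collect the results into a dataframe (slow: one request per kanji page):
kanji_df = pd.DataFrame(JLPTKanji())
kanji_df = kanji_df.sort_values('Rank')   # Rank is 0 where the page gave none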
def japaneseLesson(link):
PURPOSE
Kanshudo has a lot of useful Japanese lessons and grammar points, along with useful sentences.
Given a particular kanshudo link, this function extracts a lot of useful information and standardises it into sizeable lesson points.
On its own, it's not much use.
So just use the japaneseLessonFull() function to get all the lessons at once, then decide from there what you want.
si = website(link)
results = []
for h4 in si.html('h4'):
word = h4.get_text().split(' - Grammar')[0]
level = 'N' + link[-1]
transliteration = tryRomaji(word)
purpose = 'Grammar'
usage = h4.find_next('div').get_text()
sentence = ''
furigana = ''
sentenceMeaning = ''
details = h4.find_next(string='DETAILS')
for sibling in list(details.parent.parent.next_siblings):
try:
text = sibling.get_text()
try:
kanji = sibling.find(class_='f_kanji').get_text()
sentence += kanji
furi = sibling.find(class_='furigana').get_text()
furigana += furi
sentence += text.replace(kanji,'').replace(furi,'')
except:
sentence += text
furigana += text
except: sentenceMeaning += sibling.strip()
furigana = tryRomaji(sentence)
results.append({'Word':word,
'Transliteration':transliteration,
'Purposes':purpose,
'Usages':usage,
'Examples':sentence,
'English':furigana,
'Translation':sentenceMeaning,
'Level':level,
'Link':link})
if len(results) == 0:
word = si.html.find('title').get_text().split(' - Grammar')[0]
level = 'N' + link[-1]
transliteration = tryRomaji(word)
usage = ''
for gpBody in si.html(class_='gp_body'):
content = gpBody.contents
purpose = 'Grammar'
sentence = ''
furigana = ''
sentenceMeaning = ''
counter = 1
while True:
try:
details = content[counter].find(string='DETAILS')
for sibling in list(details.parent.parent.next_siblings):
try:
text = sibling.get_text()
try:
kanji = sibling.find(class_='f_kanji').get_text()
sentence += kanji
furi = sibling.find(class_='furigana').get_text()
furigana += furi
sentence += text.replace(kanji,'').replace(furi,'')
except:
sentence += text
furigana += text
except: sentenceMeaning += sibling.strip()
except: break
counter += 2
if 'DETAILS' not in content[1].get_text():
usage = content[1].get_text()
else:
usage = content[1].find(class_='tatoeba').parent.previous_sibling.get_text()
furigana = tryRomaji(sentence)
results.append({'Word':word,
'Transliteration':transliteration,
'Purposes':purpose,
'Usages':usage,
'Examples':sentence,
'English':furigana,
'Translation':sentenceMeaning,
'Level':level,
'Link':link})
return results
def japaneseLessonFull():
PURPOSE
This function extracts all the lesson points for the five JLPT levels from Kanshudo.
It gets pretty much all the grammar points, their explanations, sample sentences, etc.
masterResults = []
for i in range(1,6):
pageNum = 1
while True:
print('JLPT N%d\nPage Number: %d' % (i,pageNum))
try:
link = 'https://www.kanshudo.com/searchg?page=' + str(pageNum) + '&q=jlpt:' + str(i)
sit = website(link)
hrefs = sit.html(href=re.compile(r'oq=jlpt:\d$'))
if len(hrefs) == 0:
break
for href in tqdm(hrefs):
try: masterResults.extend(japaneseLesson('https://www.kanshudo.com' + href['href']))
except: continue
except Exception as e:
print(e)
break
pageNum += 1
return masterResults
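Typical usage; it walks every JLPT level page by page, so expect a long run:
lessons = japaneseLessonFull()
lessons_df = pd.DataFrame(lessons)   # one row per extracted grammar point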
KOREAN FUNCTIONS
Here are the functions I've created to help me learn Korean:
def tryKoreanRomanizer(x):
PURPOSE
Wraps transliter.translit() so that any error simply returns the input unchanged, converting Korean text to its romanized form.
try: return transliter.translit(x)
except: return x
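For example:
print(tryKoreanRomanizer('안녕하세요'))   # e.g. 'annyeonghaseyo' under the academic rule
print(tryKoreanRomanizer(12345))          # non-text input falls back unchanged: 12345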
def tryHanja(x,tolerance=2):
PURPOSE
Due to shared language history, Korean words often have Chinese origins.
To make the learning experience easier, I've decided to map as many Korean words as I can to their Chinese forms.
I used the koreanhanja.app site to do my searching.
Note that inputting certain short Korean words like "가" would yield a plethora of results.
As such, I added a tolerance parameter that only accepts inputs at least as long as the tolerance.
try:
if len(x) >= tolerance:
url = 'https://koreanhanja.app/%' + str(x.encode()).replace(r'\x','%')[3:-1]
splitted = [i.getText() for i in website(url).html('a')]
splitted = [trySimplified(i.strip()) for i in splitted]
clean_splitted = list(filter(lambda y: re.search('[\u4e00-\u9fff]+',y),splitted))
min_length = min([len(i.strip()) for i in clean_splitted])
short_splitted = ', '.join(list(set(filter(lambda y: len(y) == min_length,clean_splitted))))
return short_splitted
else: return ''
except: return ''
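A quick sketch (actual matches depend on koreanhanja.app):
print(tryHanja('학교'))   # e.g. '学校', simplified via trySimplified()
print(tryHanja('가'))     # '' since len('가') is below the default tolerance of 2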
def koreanLesson(link):
PURPOSE
howtostudykorean.com has a good collection of lessons laid out, so I decided to extract them.
This was much easier said than done, because there is a lot of unstructured content on the site.
It took a lot of pattern-finding and regex searching.
Like japaneseLesson(), this function isn't much use on its own; it exists for the koreanLessonFull() function.
site = website(link)
title = re.findall(r'Lesson \d+',site.html(class_='titlebar-title')[0].get_text())[0]
words = {}
for collapse in site.html(class_='collapseomatic'):
word = collapse.find_previous('a').get_text()
words.update({word:{}})
try:
words[word]['Meaning'] = collapse.get_text()
except: continue
try:
maybe_common = collapse.find_next(string=re.compile('Common'))
if maybe_common.find_next('a') == collapse.find_next('a'):
words[word]['Common'] = maybe_common.parent.get_text().split('\n')[1:]
except: continue
try:
maybe_examples = collapse.find_next(string=re.compile('Examples'))
if maybe_examples.find_next('a') == collapse.find_next('a'):
words[word]['Examples'] = [i.replace('\n',' ') for i in re.split('\n(?!=)',re.split('\n',maybe_examples.parent.get_text(),maxsplit=1)[1])]
except: continue
if len(words) == 0:
for i in site.html(string=re.compile('^ = [a-z]')):
words.update({i.find_previous('a').get_text():{'Meaning':i.split('=')[1].strip()}})
if len(words) == 0:
for i in re.findall('(.* = .{1,40})\n',site.html.get_text()):
words.update({i.split(' = ')[0]:i.split(' = ')[1]})
contents = {}
hrefs = [x['href'][1:] for x in site.html(href=re.compile(r'^#\d'))]
for href in hrefs:
try:
topic = site.html(attrs = {'name':href})[0].find_next('p',string=re.compile(r'\w'))
topicText = topic.get_text()
contents.update({topicText:{}})
contents[topicText]['Link'] = link + '#' + href
try:
sentence = topic.find_next('a')
sentenceText = sentence.get_text()
contents[topicText]['Sentence'] = sentenceText
meaningText = sentence.find_next('p').get_text()
contents[topicText]['Meaning'] = meaningText
except: continue
numColon = 0
intro = ''
sentences = []
currentTag = topic.find_next('p')
while numColon == 0:
try:
if ':' in currentTag.get_text():
numColon += 1
intro += ' ' + currentTag.get_text()
currentTag = currentTag.find_next('p')
except: break
contents[topicText]['Intro'] = '.'.join(re.split(r'\.[^\.]',intro)[:5])
except Exception as e: print(e)
return {title:{'Link':link,'Words':words,'Contents':contents}}
def koreanLessonFull():
PURPOSE
This function extracts all the lesson info from the howtostudykorean.com lesson plan.
It gives the links to the lessons, the new vocabulary contained within them, as well as the contents for the lessons themselves.
site = website('https://www.howtostudykorean.com/other-stuff/lesson-list/')
links = []
for i in site.html(string=re.compile('Lesson ')):
try:
link = i.find_previous('a')['href']
if 'other-stuff' not in link:
links.append(link)
except: continue
lessons = {}
for link in tqdm(links):
try: lessons.update(koreanLesson(link))
except Exception as e:
print(link)
print(e)
return lessons
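Typical usage, as a sketch (the lesson keys come from the 'Lesson \d+' titles scraped above):
lessons = koreanLessonFull()
print(lessons['Lesson 1']['Words'])   # vocabulary extracted for that lesson, if present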