diff --git a/.bin/file-exts-downloader.py b/.bin/file-exts-downloader.py new file mode 100644 index 00000000..2849f4bb --- /dev/null +++ b/.bin/file-exts-downloader.py @@ -0,0 +1,119 @@ +#!/usr/bin/python3 + +# Open a pull req if any sources are missing/need to be added + +from bs4 import BeautifulSoup +import requests +import time +import re + +MICROSOFT_URL="https://support.microsoft.com/en-us/windows/common-file-name-extensions-in-windows-da4a4430-8e76-89c5-59f7-1cdbbc75cb01" +WIKI_ROOT="https://en.wikipedia.org" +WIKI_URL="https://en.wikipedia.org/wiki/List_of_filename_extensions" +GFG_URL="https://www.geeksforgeeks.org/list-of-file-formats/" + +wiki_urls=[] +exts=[] +tables=[] + +res=requests.get(WIKI_URL).text +wiki_soup=BeautifulSoup(res,"html.parser") + +for i in wiki_soup.findAll('div',{"class":"mw-content-ltr mw-parser-output"})[0].findAll("ul")[2].findAll("li"): + wiki_urls.append(WIKI_ROOT+i.a.get('href')) + +for i in wiki_urls: + res=requests.get(i).text + wiki_soup=BeautifulSoup(res,"html.parser") + + tables+=wiki_soup.findAll("table") + time.sleep(0.5) + +for i in tables: + + if "wikitable" not in i["class"]: + continue + + + for ext in i.tbody.findAll("tr"): + ext=ext.findAll('td') + + if ext==[]: + continue + + ext=re.sub(r"\[.*?\]","",ext[0].text).strip() + + if "," in ext: + for j in ext.split(","): + + if "." in j: + continue + + exts.append(j.strip()) + continue + + if "." in ext: + continue + + exts.append(ext) + +res=requests.get(MICROSOFT_URL).text +microsoft_soup=BeautifulSoup(res,"html.parser") + +microsoft_exts=microsoft_soup.findAll("tbody")[1].findAll('p')[::2] + +for i in microsoft_exts: + + i=i.text + + if "," in i: + i=i.split(",") + for j in i: + exts.append(j.strip().upper()) + continue + + exts.append(i.upper()) + +res=requests.get(GFG_URL).text +gfg_soup=BeautifulSoup(res,"html.parser") + +gfg_exts=gfg_soup.findAll("tbody") + +for i in gfg_exts: + i=i.findAll('th') + for ext in i: + ext=ext.text.strip() + if ext.startswith('.'): + exts.append(ext[1:].upper()) + else: + exts.append(ext.upper()) + +exts=list(dict.fromkeys(exts)) + +open("file-ext.txt","w").write("\n".join(exts)) + +mutated_exts=[] + +for i in exts: + mutated_exts.append(i) + mutated_exts.append(i.lower()) + +mutated_exts=list(dict.fromkeys(mutated_exts)) + +open("file-ext-all-cases.txt","w").write("\n".join(mutated_exts)) + +mutated_exts=[] + +for i in exts: + mutated_exts.append(i.lower()) + +mutated_exts=list(dict.fromkeys(mutated_exts)) +open("file-ext-lower-cases.txt","w").write("\n".join(mutated_exts)) + +mutated_exts=[] + +for i in exts: + mutated_exts.append(i.upper()) + +mutated_exts=list(dict.fromkeys(mutated_exts)) +open("file-ext-upper-cases.txt","w").write("\n".join(mutated_exts))