diff options
| author | msglm <msglm@techchud.xyz> | 2026-05-10 20:32:35 -0500 |
|---|---|---|
| committer | msglm <msglm@techchud.xyz> | 2026-05-10 20:32:35 -0500 |
| commit | 1db411b17407986627bd00b91c6d60dcaf4b1cea (patch) | |
| tree | 57ae19f47f431faeaf691a2c7b0186cf1ebdea62 | |
| parent | a8fd261a82f3c748c3263fe5cffa0eba10b8a552 (diff) | |
| download | getmeajob-0.0.4.tar.gz getmeajob-0.0.4.tar.bz2 getmeajob-0.0.4.zip | |
add jsfirm support (tag: 0.0.4)
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | getmeajob/getmeajob.py | 90 |
| -rw-r--r-- | getmeajob/scrapers/__init__.py | 0 |
| -rw-r--r-- | getmeajob/scrapers/jobspy.py | 17 |
| -rw-r--r-- | getmeajob/scrapers/jsfirm.py | 97 |
| -rw-r--r-- | pyproject.toml | 1 |
5 files changed, 169 insertions, 36 deletions
diff --git a/getmeajob/getmeajob.py b/getmeajob/getmeajob.py index cec1df1..7524637 100644 --- a/getmeajob/getmeajob.py +++ b/getmeajob/getmeajob.py @@ -8,7 +8,9 @@ from jobspy import scrape_jobs import argparse from fp.fp import FreeProxy from jinja2 import Template -#from llama_cpp import Llama +import getmeajob.scrapers.jobspy as jobspyscraper +import getmeajob.scrapers.jsfirm as jsfirmscraper +from llama_cpp import Llama template = Template(""" {# jobs is a list of dicts with the fields you specified #} @@ -89,6 +91,33 @@ template = Template(""" </html> """) +def jobResults(site, listingEntry: dict) -> dict: + match site: + case "indeed": + return jobspyscraper.getJobs(listingEntry) + case "linkedin": + return jobspyscraper.getJobs(listingEntry) + case "zip_recruiter": + return jobspyscraper.getJobs(listingEntry) + case "google": + return jobspyscraper.getJobs(listingEntry) + case "glassdoor": + return jobspyscraper.getJobs(listingEntry) + case "bayt": + return jobspyscraper.getJobs(listingEntry) + case "naukri": + return jobspyscraper.getJobs(listingEntry) + case "bdjobs": + return jobspyscraper.getJobs(listingEntry) + case "jsfirm": + return jsfirmscraper.getJobs(listingEntry) + #case "aviationjobsearch": + # return scrapers.aviationjobsearch.getJobs(listingEntry) + case _: + print("COULD NOT FIND SCRAPER FOR " + site + "!") + exit(1) + + def main(): parser = argparse.ArgumentParser(description='Get Me A Job!\n\nA CLI tool for scraping various sites and getting you a job. 
Outputs an HTML document.') parser.add_argument('config_file', type=str, help='The location of your getmeajob config file.') @@ -97,45 +126,34 @@ def main(): config = toml.load(open(args.config_file, 'rb')) for joblistingnames in config.keys(): - listingentry = config[joblistingnames] - - jobs = scrape_jobs( - site_name=listingentry["sites"], - search_term=listingentry["search"], - google_search_term=listingentry["search"] + " near " + listingentry["location"], - location=listingentry["location"], - results_wanted=listingentry["results_wanted"], - hours_old=listingentry["hours_old"], - country_indeed=listingentry["country"], - linkedin_fetch_description=True, # gets more info such as description, direct job url (slower) - proxies=[FreeProxy().get()] if listingentry["proxy"] else [] - ) - - jobsdict = jobs.to_dict(orient='records') + listingEntry = config[joblistingnames] + + for site in listingEntry["sites"]: + jobsdict = jobResults(site, listingEntry) - print(f"Found {len(jobs)} jobs") + print(f"Found {len(jobsdict)} jobs") for job in jobsdict: print(job) -# if len(listingentry["automated_questions"]) > 0: -# -# qanda = [] -# -# llm = Llama.from_pretrained(repo_id="Mungert/Qwen3-4B-abliterated-GGUF", -# filename="*Q8_0.gguf", -# verbose=True -# ) -# for question in listingentry["automated_questions"]: -# qanda.append(llm.create_chat_completion( -# messages = [ -# {"role": "system", "content": "You are a summarizer tasked with summarizing job applications. Presented to you are the user's question about a job description and the description. Using only the provided description, answer the question to the best of your ability. 
If you are incapable of figuring out the answer, inform the user of this."}, -# { -# "role": "user", -# "content": "Job Summary: \n" + jobsdict["description"] + "\n\n Here is my question:\n " + question -# } -# ])) -# -# print(qanda) + if len(listingentry["automated_questions"]) > 0: + + qanda = [] + + llm = Llama.from_pretrained(repo_id="Mungert/Qwen3-4B-abliterated-GGUF", + filename="*Q8_0.gguf", + verbose=True + ) + for question in listingentry["automated_questions"]: + qanda.append(llm.create_chat_completion( + messages = [ + {"role": "system", "content": "You are a summarizer tasked with summarizing job applications. Presented to you are the user's question about a job description and the description. Using only the provided description, answer the question to the best of your ability. If you are incapable of figuring out the answer, inform the user of this."}, + { + "role": "user", + "content": "Job Summary: \n" + jobsdict["description"] + "\n\n Here is my question:\n " + question + } + ])) + + print(qanda) open(args.output_file, "w", encoding="utf-8").write(template.render(jobs=jobsdict)) diff --git a/getmeajob/scrapers/__init__.py b/getmeajob/scrapers/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/getmeajob/scrapers/__init__.py diff --git a/getmeajob/scrapers/jobspy.py b/getmeajob/scrapers/jobspy.py new file mode 100644 index 0000000..5853baa --- /dev/null +++ b/getmeajob/scrapers/jobspy.py @@ -0,0 +1,17 @@ +from jobspy import scrape_jobs + +def getJobs(listingEntry: dict) -> dict: + jobs = scrape_jobs( + site_name=listingEntry["sites"], + search_term=listingEntry["search"], + google_search_term=listingEntry["search"] + " near " + listingEntry["location"], + location=listingEntry["location"], + results_wanted=listingEntry["results_wanted"], + hours_old=listingEntry["hours_old"], + country_indeed=listingEntry["country"], + linkedin_fetch_description=True, # gets more info such as description, direct job url (slower) + 
proxies=[FreeProxy().get()] if listingEntry["proxy"] else [] + ) + + return jobs.to_dict(orient='records') + diff --git a/getmeajob/scrapers/jsfirm.py b/getmeajob/scrapers/jsfirm.py new file mode 100644 index 0000000..7547afc --- /dev/null +++ b/getmeajob/scrapers/jsfirm.py @@ -0,0 +1,97 @@ +import argparse +import os +import selenium +import time +import random +import requests +import tempfile +import shutil +import base64 +from time import sleep +from selenium.webdriver.chrome.options import Options +from selenium.webdriver.common.keys import Keys +from selenium.webdriver.common.by import By +from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support import expected_conditions +from selenium.webdriver.chrome.service import Service +from selenium import webdriver + +def extractLinksFromTbody(tbody): + jobLinks = [] + for row in tbody.find_elements(By.TAG_NAME, "tr"): + + if "pagination-ys" in ((row.get_property("class") or "").split()): + break + tds = row.find_element(By.TAG_NAME, "td") + jobLinks.append(tds.find_elements(By.TAG_NAME, "a")[1].get_property("href")) + return jobLinks + +def gotoNextPage(tbody, pageToGoTo): + try: + nav = tbody.find_element(By.CLASS_NAME, "pagination-ys").find_element(By.TAG_NAME, "tr") + except: + return + + pagelinks = nav.find_elements(By.TAG_NAME, "a") + + for link in pagelinks: + if int(link.text) == pageToGoTo: + link.click() + +def getMaximumPages(tbody, placeToReturnTo, driver): + try: + #Navigate to the final page + nav = tbody.find_element(By.CLASS_NAME, "pagination-ys").find_element(By.TAG_NAME, "tr") + navLinks = nav.find_elements(By.TAG_NAME, "a") + finalPageLink = navLinks[navLinks.length - 1] + finalPageLink.click() + time.sleep(10) + driver.get(placeToReturnTo) + return int(driver.find_element(By.XPATH, """//*[@id="ContentPlaceHolder3_ContentPlaceHolder3_gvJobs"]""").find_element(By.TAG_NAME, "tbody").find_element(By.TAG_NAME, "span").text) + except: + time.sleep(10) + 
driver.get(placeToReturnTo) + return 1 + + + + +def getJobs(listingEntry: dict): + chrome_options = Options() + driver = webdriver.Chrome() + + url = "https://www.jsfirm.com/AllCategories/" + listingEntry["search"] + "/" + listingEntry["location"] + "/searchquickjobs" + driver.get(url) + driver.implicitly_wait(1.5) + + + jobLinks = [] + curPage = 0 + time.sleep(10) + pageResults = driver.find_element(By.XPATH, """//*[@id="ContentPlaceHolder3_ContentPlaceHolder3_gvJobs"]""").find_element(By.TAG_NAME, "tbody") + + for page in range(curPage, getMaximumPages(pageResults, url, driver)): + pageResults = driver.find_element(By.XPATH, """//*[@id="ContentPlaceHolder3_ContentPlaceHolder3_gvJobs"]""").find_element(By.TAG_NAME, "tbody") + jobLinks = jobLinks + extractLinksFromTbody(pageResults) + + curPage = page + 1 + gotoNextPage(pageResults, curPage) + + #Actually read the jobs + jobsDictContainer = [] + + for job in jobLinks: + driver.get(job) + + jobsDictContainer.append( + { + "job_url": job, + "company": driver.find_element(By.ID, "ContentPlaceHolder2_ContentPlaceHolder2_ucCompanyOverview_lblCompanyName").text, + "company_url": "https://www.jsfirm.com" + driver.find_element(By.ID, "ContentPlaceHolder2_ContentPlaceHolder2_ucCompanyOverview_lblJobsCompany").find_element(By.TAG_NAME, "a").get_property("href"), + "title": driver.find_element(By.XPATH, "/html/body/form[1]/div[5]/div/div[2]/div/div[5]/div[1]/div[1]/div").text, + "location": driver.find_element(By.ID, "ContentPlaceHolder2_ContentPlaceHolder2_hLocation").get_property("value"), + "description": driver.find_element(By.ID, "ContentPlaceHolder2_ContentPlaceHolder2_ltlDescription").text + } + ) + + return jobsDictContainer diff --git a/pyproject.toml b/pyproject.toml index 2b22f7a..7780104 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,6 +14,7 @@ dependencies = [ "JobSpy @ git+https://github.com/speedyapply/JobSpy.git", "llama-cpp-python", "free-proxy", + "selenium", ] [project.scripts] |
