Skip to content
This repository has been archived by the owner on Jun 23, 2024. It is now read-only.

Commit

Permalink
added total comp
Browse files Browse the repository at this point in the history
  • Loading branch information
kuutsav committed Feb 11, 2022
1 parent 9d3829f commit 5289289
Show file tree
Hide file tree
Showing 6 changed files with 124 additions and 56 deletions.
2 changes: 1 addition & 1 deletion data/meta_info.json
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"totalPosts": 6670, "totalPostsFromIndia": 3592, "lastUpdated": "2022/02/11 14:43:07", "top20Companies": [["Amazon", 294], ["Microsoft", 157], ["Oracle", 109], ["Goldman Sachs", 69], ["Google", 67], ["Paytm", 66], ["Flipkart", 65], ["Walmart", 63], ["Ola", 62], ["Oyo", 52], ["Paypal", 46], ["Wipro", 45], ["Infosys", 41], ["Swiggy", 38], ["Adobe", 37], ["Salesforce", 36], ["Uber", 35], ["Vmware", 35], ["Tcs", 33], ["Arcesium", 31]], "mostOffersInLastMonth": [["Amazon", 18], ["Microsoft", 15], ["Google", 12], ["Walmart", 9], ["Wipro", 9], ["Paytm", 8], ["Oracle", 8], ["Flipkart", 8], ["Cognizant", 8], ["Goldman Sachs", 5]]}
{"totalPosts": 6710, "totalPostsFromIndia": 3617, "totalPostsWithTotalComp": 2153, "lastUpdated": "2022/02/11 17:47:12", "top20Companies": [["Amazon", 294], ["Microsoft", 158], ["Oracle", 110], ["Google", 70], ["Goldman Sachs", 69], ["Flipkart", 67], ["Paytm", 66], ["Walmart", 63], ["Ola", 62], ["Oyo", 52], ["Paypal", 47], ["Wipro", 46], ["Infosys", 41], ["Adobe", 38], ["Swiggy", 38], ["Salesforce", 36], ["Uber", 35], ["Vmware", 35], ["Tcs", 33], ["Arcesium", 31]], "mostOffersInLastMonth": [["Amazon", 18], ["Microsoft", 16], ["Google", 15], ["Flipkart", 10], ["Wipro", 10], ["Oracle", 9], ["Walmart", 9], ["Paytm", 8], ["Cognizant", 8], ["Accenture", 6]]}
5 changes: 3 additions & 2 deletions index.html
Original file line number Diff line number Diff line change
Expand Up @@ -88,8 +88,9 @@ <h1 id="leetComp" class="col">💸 LeetComp 💸</h1>
<th scope="col" style="width:25%">Role</th>
<th scope="col" style="width:10%">Yoe <span class="sortButton" onclick="sortBySalary(this)"
id="cleanYoe"></span></th>
<th scope="col" style="width:14%">Salary (base) <span class="sortButton"
onclick="sortBySalary(this)" id="cleanSalary"></span></th>
<th scope="col" style="width:14%">Salary <span class="sortButton" onclick="sortBySalary(this)"
id="cleanSalary"></span> <span class="sortButton" onclick="sortBySalary(this)"
id="cleanSalaryTotal"></span></th>
<th scope="col" style="width:10%">Date <span class="sortButton" onclick="sortBySalary(this)"
id="date"></span></th>
<th scope="col" style="width:6%">Views <span class="sortButton" onclick="sortBySalary(this)"
Expand Down
6 changes: 3 additions & 3 deletions js/data.js

Large diffs are not rendered by default.

87 changes: 58 additions & 29 deletions js/scripts.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,18 @@ var data = [];

// Data ix and key (we dropped the keys to reduce data size and save network cost)
keyMap = {
"id": 0, "title": 1, "voteCount": 2, "viewCount": 3, "date": 4, "company": 5,
"role": 6, "yoe": 7, "salary": 8, "city": 9, "country": 10, "cleanYoe": 11, "cleanSalary": 12,
"yrOrPm": 13, "cleanCompany": 14
"id": 0, "voteCount": 1, "viewCount": 2, "date": 3, "company": 4, "role": 5,
"cleanYoe": 6, "cleanSalary": 7, "yrOrPm": 8, "cleanSalaryTotal": 9, "cleanCompany": 10
}

// Constants
var pageSize = 25;
var pageSize = 20;
var nPages = Math.ceil(data.length / pageSize);

function setFullTimeOrInternship(yrOrPm) {
window.data = [];
for (i = 0; i < allData.length; i++) {
if (allData[i][13] == yrOrPm) {
if (allData[i][keyMap["yrOrPm"]] == yrOrPm) {
window.data.push(allData[i]);
}
}
Expand All @@ -32,27 +31,46 @@ var tableTbodyRef = document.getElementById("postInfo").getElementsByTagName("tb
function getAllBaseSalaries() {
var salaries = [];
for (i = 0; i < data.length; i++) {
salaries.push(data[i][12] / 100000)
salaries.push(data[i][keyMap["cleanSalary"]] / 100000)
}
return salaries;
}

/**
 * Collect the total compensation of every row in the current `data` view.
 *
 * Rows whose `cleanSalaryTotal` column holds the -1 sentinel are skipped.
 * Each remaining value is divided by 100000 so it lines up with the
 * " lpa" suffix used on the salary histogram's x axis.
 *
 * @returns {number[]} Total salaries (value / 100000), in row order.
 */
function getAllTotalSalaries() {
    var totalIx = keyMap["cleanSalaryTotal"];
    var salaries = [];
    // `i` is declared locally: assigning an undeclared `i` creates an
    // implicit global and throws under strict mode / ES modules.
    for (var i = 0; i < data.length; i++) {
        var total = data[i][totalIx];
        if (total != -1) {
            salaries.push(total / 100000);
        }
    }
    return salaries;
}

function plotSalaryBarChartData() {
salaries = getAllBaseSalaries();
var trace = {
totalSalaries = getAllTotalSalaries();
var trace1 = {
x: salaries,
name: "base",
type: "histogram",
opacity: 0.5,
marker: { color: "green" }
};
var trace2 = {
x: totalSalaries,
name: "total",
type: "histogram",
opacity: 0.5,
marker: { color: "red" }
};
var layout = {
title: { text: "# salaries #", font: { size: 12 } },
height: 400,
margin: { t: 20, l: 0, r: 0 },
yaxis: { automargin: true },
xaxis: { tickprefix: "₹ ", ticksuffix: " lpa" }
};
var salaryBarChart = [trace];
var salaryBarChart = [trace1, trace2];
Plotly.newPlot("salaryBarChart", salaryBarChart, layout);
}
plotSalaryBarChartData();
Expand Down Expand Up @@ -85,20 +103,20 @@ plotTopCompaniesChartData();
function plotSalaryYoeBinsChart() {
var yoeBin1 = []; var yoeBin2 = []; var yoeBin3 = []; var yoeBin4 = []; var yoeBin5 = [];
for (i = 0; i < data.length; i++) {
if (data[i][11] >= 0 && data[i][11] < 1) {
yoeBin1.push(data[i][12]);
if (data[i][keyMap["cleanYoe"]] >= 0 && data[i][keyMap["cleanYoe"]] < 1) {
yoeBin1.push(data[i][keyMap["cleanSalary"]]);
}
else if (data[i][11] >= 1 && data[i][11] < 3) {
yoeBin2.push(data[i][12]);
else if (data[i][keyMap["cleanYoe"]] >= 1 && data[i][keyMap["cleanYoe"]] < 3) {
yoeBin2.push(data[i][keyMap["cleanSalary"]]);
}
else if (data[i][11] >= 3 && data[i][11] < 6) {
yoeBin3.push(data[i][12]);
else if (data[i][keyMap["cleanYoe"]] >= 3 && data[i][keyMap["cleanYoe"]] < 6) {
yoeBin3.push(data[i][keyMap["cleanSalary"]]);
}
else if (data[i][11] >= 6 && data[i][11] < 9) {
yoeBin4.push(data[i][12]);
else if (data[i][keyMap["cleanYoe"]] >= 6 && data[i][keyMap["cleanYoe"]] < 9) {
yoeBin4.push(data[i][keyMap["cleanSalary"]]);
}
else if (data[i][11] >= 9) {
yoeBin5.push(data[i][12]);
else if (data[i][keyMap["cleanYoe"]] >= 9) {
yoeBin5.push(data[i][keyMap["cleanSalary"]]);
}
}
var trace1 = {
Expand Down Expand Up @@ -156,19 +174,29 @@ function getFormattedYoe(yoe) {
}
}

/**
 * Build the HTML snippet shown for a row's total compensation.
 *
 * @param {number} totalSalary Total compensation, or -1 when none is recorded.
 * @returns {string} An "n/a" button for the -1 sentinel, otherwise the
 *     amount prefixed with "₹ " and grouped using the "en-IN" locale.
 */
function getFormattedTotalSalary(totalSalary) {
    var isMissing = totalSalary == -1;
    return isMissing
        ? "<button class='btn-danger'>n/a</button>"
        : "₹ " + totalSalary.toLocaleString("en-IN");
}

// Add rows to the postInfo table
function updatePostsTableContent(startIndex, endIndex) {
var myHtmlContent = "";
endIndex = Math.min(data.length, endIndex)
for (var i = startIndex; i < endIndex; i++) {
myHtmlContent += "<tr><td>" + data[i][5] + "</td>";
myHtmlContent += "<td>" + data[i][6].toLowerCase() + "</td>";
myHtmlContent += "<td>" + getFormattedYoe(data[i][11]) + "</td>";
myHtmlContent += "<td>₹ " + data[i][12].toLocaleString("en-IN") + "</td>";
myHtmlContent += "<td>" + data[i][4] + "</td>";
myHtmlContent += "<td>" + data[i][3] + "</td>";
myHtmlContent += "<td>" + data[i][2] + "</td>";
myHtmlContent += "<td>" + data[i][0] + "</td></tr>";
myHtmlContent += "<tr><td>" + data[i][keyMap["company"]] + "</td>";
myHtmlContent += "<td>" + data[i][keyMap["role"]].toLowerCase() + "</td>";
myHtmlContent += "<td>" + getFormattedYoe(data[i][keyMap["cleanYoe"]]) + "</td>";
myHtmlContent += "<td>base: ₹ " + data[i][keyMap["cleanSalary"]].toLocaleString("en-IN");
myHtmlContent += "<br>total: " + getFormattedTotalSalary(data[i][keyMap["cleanSalaryTotal"]]) + "</td>";
myHtmlContent += "<td>" + data[i][keyMap["date"]] + "</td>";
myHtmlContent += "<td>" + data[i][keyMap["viewCount"]] + "</td>";
myHtmlContent += "<td>" + data[i][keyMap["voteCount"]] + "</td>";
myHtmlContent += "<td>" + data[i][keyMap["id"]] + "</td></tr>";
}
tableTbodyRef.innerHTML = myHtmlContent;
};
Expand Down Expand Up @@ -249,14 +277,14 @@ function filterSearchIndexes(ixs) {
window.data = [];
if (document.getElementById("fullTimeButton").classList.contains("active")) {
for (i = 0; i < ixs.length; i++) {
if (allData[ixs[i]][13] == "yearly") {
if (allData[ixs[i]][keyMap["yrOrPm"]] == "yearly") {
window.data.push(allData[ixs[i]]);
}
}
}
else if (document.getElementById("internshipButton").classList.contains("active")) {
for (i = 0; i < ixs.length; i++) {
if (allData[ixs[i]][13] == "monthly") {
if (allData[ixs[i]][keyMap["yrOrPm"]] == "monthly") {
window.data.push(allData[ixs[i]]);
}
}
Expand Down Expand Up @@ -311,7 +339,7 @@ function _yoeFilter(e) {
}
window.data = [];
for (i = 0; i < allData.length; i++) {
yoe = parseFloat(allData[i][11]);
yoe = parseFloat(allData[i][keyMap["cleanYoe"]]);
if (yoe >= minYoe && yoe <= maxYoe) {
window.data.push(allData[i]);
}
Expand All @@ -331,6 +359,7 @@ for (i = 0; i < metaInfo["mostOffersInLastMonth"].length; i++) {
// Stats
document.getElementById("stats").innerHTML = "Total Posts: " + metaInfo["totalPosts"]
+ " | Posts from India: " + metaInfo["totalPostsFromIndia"]
+ " | Posts with Total Comp: " + metaInfo["totalPostsWithTotalComp"]
+ " | Last updated: " + metaInfo["lastUpdated"]


Expand Down
80 changes: 59 additions & 21 deletions leetcomp/ner_heuristic.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@


BASE_SALARY_RANGE_INDIA = (2_00_000, 100_00_000)
TOTAL_SALARY_RANGE_INDIA = (2_00_000, 200_00_000)
TOTAL_TO_BASE_MAX_RATIO = 2.5
INTERN_SALARY_RANGE_INDIA = (10_000, 2_00_000)

LABEL_SPECIFICATION = {
Expand All @@ -24,7 +26,7 @@
"RE_SALARY": re.compile(r"(salary|base|base pay)\s?[:-]-?\s?(?P<label>[\w\,\₹\$\.\/\-\(\)\`\\u20b9&#8377;\~ ]+)"),
"RE_LOCATION": re.compile(r"location\s?[:-]-?\s?(?P<label>[\w\,\` ]+)"),
"RE_SALARY_TOTAL": re.compile(
r"\ntot?al (1st year\s)?(comp[e|a]nsation|comp|ctc)(\sfor 1st year)?(\s?\(\s?(salary|base).+?\))?(?P<label>.+)"
r"\\ntot?al (1st year\s)?(comp[e|a]nsation|comp|ctc)(\sfor 1st year)?(\s?\(\s?(salary|base).+?\))?(?P<label>.+)"
),
"RE_SALARY_CLEAN_LPA": re.compile(r"(\d{1,3}(\.\d{1,2})?)\s?(lpa|lakh|lac|l)"),
}
Expand Down Expand Up @@ -63,16 +65,17 @@ def _find_matches(regex_pattern: Pattern[str], content: str) -> List[str]:


def _get_info_as_flat_list(
companies: List[str], titles: List[str], yoes: List[str], salaries: List[str], info: Dict[str, Any]
companies: List[str], roles: List[str], yoes: List[str], pays: List[str], pays_t: List[str], info: Dict[str, Any]
) -> List[Dict[str, Any]]:
n_info = min([len(companies), len(titles), len(yoes), len(salaries)])
n_info = min([len(companies), len(roles), len(yoes), len(pays)])
expanded_info = []
for _ in range(n_info):
_info = info.copy()
_info["company"] = companies[0]
_info["role"] = titles[0]
_info["role"] = roles[0]
_info["yoe"] = yoes[0]
_info["salary"] = salaries[0]
_info["salary"] = pays[0]
_info["salaryTotal"] = pays_t[0] if pays_t else ""
expanded_info.append(_info)
return expanded_info

Expand Down Expand Up @@ -133,6 +136,9 @@ def _report(raw_info: List[Dict[str, Any]]) -> None:
logger.info(f"Posts with Location: {len([r for r in raw_info if 'country' in r])}")
logger.info(f"Posts with YOE: {len([r for r in raw_info if r['cleanYoe'] >= 0])}")
logger.info(f"Posts from India: {len([r for r in raw_info if 'country' in r and r['country'] == 'india'])}")
logger.info(
f"Posts with Total Comp: {len([r for r in raw_info if 'cleanSalaryTotal' in r and r['cleanSalaryTotal'] != -1.0])}"
)


def _is_valid_yearly_base_pay_from_india(base_pay: float):
Expand All @@ -143,13 +149,23 @@ def _is_valid_monthly_internship_pay_from_india(base_pay: float):
return base_pay >= INTERN_SALARY_RANGE_INDIA[0] and base_pay <= INTERN_SALARY_RANGE_INDIA[1]


def _is_valid_monthly_total_pay_from_india(base_pay: float):
    # Inclusive range check against the (lower, upper) pair in TOTAL_SALARY_RANGE_INDIA.
    # NOTE(review): the visible caller passes r["cleanSalaryTotal"], so despite the
    # "monthly" / "base_pay" naming this appears to validate a total figure - confirm.
    lower, upper = TOTAL_SALARY_RANGE_INDIA
    return lower <= base_pay <= upper


def _filter_invalid_salaries(raw_info: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
n_india = 0
n_dropped = 0
filtered_info = []
for r in raw_info:
if "country" in r and r["country"] == "india":
n_india += 1
if "cleanSalaryTotal" in r and r["cleanSalaryTotal"] != -1:
if not _is_valid_monthly_total_pay_from_india(r["cleanSalaryTotal"]):
r["cleanSalaryTotal"] = -1.0
elif r["cleanSalaryTotal"] / r["cleanSalary"] > TOTAL_TO_BASE_MAX_RATIO:
r["cleanSalaryTotal"] = -1.0

if r["yrOrPm"] == "yearly" and not _is_valid_yearly_base_pay_from_india(r["cleanSalary"]):
n_dropped += 1
continue
Expand All @@ -162,6 +178,28 @@ def _filter_invalid_salaries(raw_info: List[Dict[str, Any]]) -> List[Dict[str, A
return filtered_info


def _add_clean_yoe_and_salaries(expanded_info: List[Dict[str, Any]], info: Dict[str, Any], title: str) -> None:
    """Add ``cleanYoe`` and, for posts from India, the cleaned salary fields to each entry in place.

    Every entry gets ``cleanYoe``. Entries whose ``country`` is ``"india"`` also get
    ``cleanSalary`` and ``yrOrPm``; of those, entries with ``yrOrPm == "yearly"`` get
    ``cleanSalaryTotal`` (-1 when no usable total was found).

    Args:
        expanded_info: Entries to annotate. Each is read for ``yoe``, ``role``, ``salary``
            and ``salaryTotal``.
        info: Unused. Kept so existing call sites keep working; the loop variable no longer
            shadows it.
        title: Raw post title, run through ``_preprocess_text`` before YOE extraction.
    """
    # Loop-invariant, so computed once. Assumes _preprocess_text has no side effects.
    clean_title = _preprocess_text(title).lower()

    for post in expanded_info:
        post["cleanYoe"] = _get_clean_yoe(post["yoe"].lower(), clean_title, post["role"].lower())
        if post.get("country") != "india":
            continue

        # Only the text before the first literal backslash-n marker is parsed. str.split returns
        # the whole string when the marker is absent, so one code path covers both cases.
        base_text = post["salary"].replace(",", "").lower().split("\\n")[0]
        post["cleanSalary"], post["yrOrPm"] = _get_clean_salary_for_india(base_text)
        if post["yrOrPm"] != "yearly":
            continue

        total_text = post["salaryTotal"].replace(",", "").lower().split("\\n")[0]
        total_salary, _ = _get_clean_salary_for_india(total_text)
        # A total is kept only when the base was parsed (not -1) and the total exceeds it.
        if post["cleanSalary"] != -1 and total_salary > post["cleanSalary"]:
            post["cleanSalaryTotal"] = total_salary
        else:
            post["cleanSalaryTotal"] = -1


def _get_clean_company_text(company: str) -> str:
return " ".join(re.findall(r"\w+", company.lower()))

Expand All @@ -179,6 +217,14 @@ def _add_clean_companies(raw_info: List[Dict[str, Any]]) -> None:
r["cleanCompany"] = " ".join([txt.capitalize() for txt in clean_company.split(" ")])


def _drop_info(raw_info: List[Dict[str, Any]]) -> None:
for r in raw_info:
try:
del r["title"], r["yoe"], r["salary"], r["salaryTotal"], r["city"], r["country"]
except KeyError:
continue


def _save_raw_info(raw_info: List[Dict[str, Any]]) -> None:
with open("data/posts_info.json", "w") as f:
json.dump(raw_info, f)
Expand All @@ -197,6 +243,9 @@ def _save_meta_info(total_posts: int, raw_info: List[Dict[str, Any]]) -> Dict[st
meta_info = {
"totalPosts": total_posts,
"totalPostsFromIndia": len([r for r in raw_info if "country" in r and r["country"] == "india"]),
"totalPostsWithTotalComp": len(
[r for r in raw_info if "cleanSalaryTotal" in r and r["cleanSalaryTotal"] != -1.0]
),
"lastUpdated": datetime.now().strftime("%Y/%m/%d %H:%M:%S"),
"top20Companies": top_20,
"mostOffersInLastMonth": most_offers,
Expand Down Expand Up @@ -229,39 +278,28 @@ def parse_posts_and_save_tagged_info() -> None:
roles = _find_matches(LABEL_SPECIFICATION["RE_ROLE"], clean_content)
yoes = _find_matches(LABEL_SPECIFICATION["RE_YOE"], clean_content)
salaries = _find_matches(LABEL_SPECIFICATION["RE_SALARY"], clean_content)
total_salaies = _find_matches(LABEL_SPECIFICATION["RE_SALARY_TOTAL"], clean_content)
if companies and roles and yoes and salaries:
expanded_info = _get_info_as_flat_list(companies, roles, yoes, salaries, info)
expanded_info = _get_info_as_flat_list(companies, roles, yoes, salaries, total_salaies, info)
location = _get_clean_location(_preprocess_text(r.title), clean_content)
if location[1]:
for info in expanded_info:
info["city"] = location[0]; info["country"] = location[1]
for info in expanded_info:
info["cleanYoe"] = _get_clean_yoe(
info["yoe"].lower(), _preprocess_text(r.title).lower(), info["role"].lower()
)
if "country" in info and info["country"] == "india":
if "\\n" in info["salary"].replace(",", "").lower():
info["cleanSalary"], info["yrOrPm"] = _get_clean_salary_for_india(
info["salary"].replace(",", "").lower().split("\\n")[0]
)
else:
info["cleanSalary"], info["yrOrPm"] = _get_clean_salary_for_india(
info["salary"].replace(",", "").lower()
)
_add_clean_yoe_and_salaries(expanded_info, info, r.title)
raw_info += expanded_info
else:
n_dropped += 1
# fmt: on

logger.info(f"Total posts: {total_posts}")
logger.info(f"N posts dropped (missing data): {n_dropped}")
_report(raw_info)
raw_info = _filter_invalid_salaries(raw_info)

_add_clean_companies(raw_info)
raw_info = sorted(raw_info, key=lambda x: x["date"], reverse=True)
_save_raw_info(raw_info)
meta_info = _save_meta_info(total_posts, raw_info)
_drop_info(raw_info)
_save_raw_info(raw_info)
_update_data_in_js(raw_info, meta_info)


Expand Down
Binary file modified posts.db
Binary file not shown.

0 comments on commit 5289289

Please sign in to comment.