-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathML_Dataset.js
More file actions
185 lines (149 loc) · 6.11 KB
/
Copy pathML_Dataset.js
File metadata and controls
185 lines (149 loc) · 6.11 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
let ml_data = 1; // 0 means company extraction dataset, 1 means email classifier dataset
function populateDataset() {
let clear = true;
// Open the active spreadsheet
const spreadsheet = SpreadsheetApp.getActiveSpreadsheet();
const queriesSheet = spreadsheet.getSheetByName("Search Queries");
const mlSheet = ml_data == 0 ? spreadsheet.getSheetByName("ML_Dataset_company_extraction") : spreadsheet.getSheetByName("ML_Dataset_email_classifier");
if (!mlSheet) {
SpreadsheetApp.getUi().alert("Ensure the sheets named ${sheetNames[0]} and ${sheetNames[1]} exist.");
return;
}
if (clear) {
mlSheet.clear();
}
const header = ml_data == 0 ? ["email subject", "email body", "company name (expected output)"] : ["email subject", "email body", "binary classification (expected output)"];
let existingRows = mlSheet.getDataRange().getValues();
// Check if the row already exists
const headerExists = existingRows.some(row => row.join() === header.join());
if (!headerExists) {
// Add the new row
mlSheet.appendRow(header);
} else {
console.log("header exists");
}
ml_rows = [];
const queries = ml_data == 0 ? queriesSheet.getRange(2, 2, queriesSheet.getLastRow() - 1).getValues().flat() : [`subject:("applying" OR "applied" OR "application" OR "applies")`];
if (queries.length === 0) {
SpreadsheetApp.getUi().alert("No queries found in 'Search Queries'. Please add some queries.");
return;
}
queries.forEach(query => {
const threads = GmailApp.search(query);
console.time("Going through all threads");
threads.forEach(thread => {
if (thread.getMessageCount() !== 1) return; // Skip threads with multiple emails
const [message] = thread.getMessages(); // Fetch only the first message
ml_rows.push([
message.getSubject(),
message.getBody(),
""
]);
console.log(message.getSubject())
});
console.timeEnd("Going through all threads");
});
ml_rows.forEach(row => {
mlSheet.appendRow(row);
});
// if (ml_data != 0) {
// mlSheet.deleteColumn(3); // delete the column that contains the output
// }
const range = mlSheet.getDataRange();
range.removeDuplicates();
console.time("Cleaning Data");
cleanData();
console.timeEnd("Cleaning Data");
console.time("Resizing sheet");
autoResizeWithMarginAndWrap(mlSheet, 15, 200);
console.timeEnd("Resizing sheet");
}
function autoResizeWithMarginAndWrap(sheet, extraWidth, wrapThreshold) {
if (!sheet) {
console.error(`Sheet with the name "${sheetName}" not found!`);
return;
}
// Get the number of columns
const maxColumns = sheet.getLastColumn();
// Step 1: Auto-resize all columns
sheet.autoResizeColumns(1, maxColumns - 1);
sheet.setColumnWidth(1, 330);
sheet.setColumnWidth(2, 700);
sheet.getDataRange().setWrap(true);
// // Step 2: Adjust column widths and enable wrapping for wide columns
// for (let col = 1; col <= maxColumns; col++) {
// const currentWidth = sheet.getColumnWidth(col);
// // Add extra width to the column
// const newWidth = currentWidth + extraWidth;
// sheet.setColumnWidth(col, newWidth);
// // Enable text wrapping if the column width exceeds the threshold
// if (newWidth > wrapThreshold) {
// const columnRange = sheet.getRange(1, col, sheet.getMaxRows());
// columnRange.setWrap(true);
// }
// }
console.log(`Auto-resized columns with an extra width of ${extraWidth} px and enabled wrapping for columns over ${wrapThreshold} px.`);
}
function cleanData() {
const spreadsheet = SpreadsheetApp.getActiveSpreadsheet();
const mlSheet = ml_data == 0 ? spreadsheet.getSheetByName("ML_Dataset_company_extraction") : spreadsheet.getSheetByName("ML_Dataset_email_classifier");
// Get all data in the sheet
const data = mlSheet.getDataRange().getValues();
const newData = [];
// Check the position of the "email body" column
const headers = data[0];
const hitsIndex = headers.indexOf("email body");
if (hitsIndex === -1) {
SpreadsheetApp.getUi().alert("The 'email body' column is missing. Please ensure your sheet has this column.");
return;
}
for (let i = 1; i < data.length; i++) {
let original = data[i][hitsIndex];
newData.push([cleanCell(original)]);
}
const range = mlSheet.getRange(2, hitsIndex + 1, data.length - 1, 1);
// console.log(range);
range.setValues(newData);
console.log("It worked");
}
function cleanCell(rawHtml) {
// let url = "https://internship-tracker-1095575192028.us-central1.run.app";
let url = "https://internship-tracker-bq0q.onrender.com"
const endpoint = "/clean-email";
url += endpoint;
const payload = JSON.stringify({ email_body: rawHtml });
const options = {
method: "POST",
contentType: "application/json",
payload: payload,
muteHttpExceptions: true, // Ensure we don't throw an error on non-2xx status codes
};
try {
const response = UrlFetchApp.fetch(url, options);
const responseText = response.getContentText(); // Get raw response
Logger.log("Response: " + responseText); // Log the raw response
const data = JSON.parse(responseText); // Try to parse JSON from the response
return data.cleaned_body || ""; // Return cleaned body
} catch (error) {
Logger.log("Error cleaning email: " + error);
return rawHtml; // Return the original raw HTML in case of an error
}
}
function testCleanCell() {
const testCases = [
{ input: "<p>Hello, world!</p>", expected: "Hello, world!" },
{ input: "", expected: "" },
{ input: null, expected: "" },
{ input: "<div>Malformed HTML without closing tags", expected: "Malformed HTML without closing tags" },
];
testCases.forEach((testCase, index) => {
const result = cleanCell(testCase.input);
Logger.log(`Test Case ${index + 1}: ${result === testCase.expected ? "Passed" : "Failed"}`);
});
}
// function cleanCell(rawHtml) {
// // Parse the HTML content to remove tags and retain only the text.
// const tempDoc = HtmlService.createHtmlOutput(rawHtml).getContent();
// const plainText = tempDoc.replace(/<[^>]*>/g, ''); // Remove HTML tags
// return plainText.replace(/\s+/g, ' ').trim(); // Clean up extra spaces
// }