
app.js 6.0KB

const config = require('config');
const axios = require('axios');
const cheerio = require('cheerio');
const express = require('express');
const cors = require('cors');
const MongoClient = require('mongodb').MongoClient;
const ObjectID = require('mongodb').ObjectID;
const apartments = require('./apartments.js');
const houses = require('./houses.js'); // required but not used in this file yet

// configuration
const mongoUrl = config.get("mongo");
const database = config.get("database");
const agendaDb = config.get("agenda");

// jobs
const Agenda = require('agenda').Agenda;
const agenda = new Agenda({ db: { address: agendaDb } });

// Background job: fetch the result pages of a scrape, parse each property with
// the apartments parser and store the parsed listings on the scrape document.
// `client` is the MongoDB connection set up further down; the job only runs
// once the server is up and connected.
agenda.define('scrape', async function (job, done) {
  const { _id } = job.attrs.data;
  try {
    const dbo = client.db(database);
    const collection = dbo.collection('scrapes');
    const scrape = await collection.findOne({ _id: _id });
    const properties = [];
    // for (let page = 1; page <= scrape.pageCount; page++) {
    for (let page = 1; page <= 1; page++) { // limited to the first page for now
      console.log("scraping page " + page);
      const filterPage = await axios(scrape.sourceUrl + `/${page}`);
      const $ = cheerio.load(filterPage.data);
      const propertyLinks = $('#placardContainer .property-link').map(function () {
        return $(this).attr('href');
      }).get();
      // let limit = 0;
      for (const link of propertyLinks) {
        // if (limit == 1) break;
        try {
          const response = await axios(link);
          const property = apartments.apartment(cheerio.load(response.data));
          property.url = link;
          properties.push(property);
          console.log(`${link} scraped.`);
          // limit++;
        } catch (err) {
          console.error(`${link} scrape failed.`);
        }
      }
    }
    await collection.updateOne({ _id: _id }, { $set: { status: "done", result: properties, endDate: new Date() } });
    console.log(`${_id} scraped.`);
    return done();
  } catch (err) {
    console.log(err);
    return done(err); // let agenda record the failure
  }
});

(async function () {
  await agenda.start();
})();

// express application
const app = express();
app.use(express.json());
app.use(cors());

// database setup
let client;
MongoClient.connect(mongoUrl, function (err, connectedClient) {
  if (err) throw err;
  console.log("Connected to MongoDB");
  console.log(mongoUrl);
  client = connectedClient;
  const dbo = connectedClient.db(database);
  dbo.createCollection("scrapes", function (err, res) {
    if (err) {
      console.log("Collection already exists!");
      return;
    }
    console.log("Collection created!");
  });
});

// list all scrape requests
app.get("/scrapes", async (req, res) => {
  try {
    const dbo = client.db(database);
    const collection = dbo.collection('scrapes');
    const data = await collection.find({}).toArray();
    return res.json(data);
  } catch (err) {
    console.log(err);
    return res.status(500).json();
  }
});

// fetch a single scrape request by id
app.get("/scrapes/:id", async (req, res) => {
  const id = req.params.id;
  try {
    const dbo = client.db(database);
    const collection = dbo.collection('scrapes');
    const o_id = new ObjectID(id);
    const data = await collection.findOne({ _id: o_id });
    return res.json(data);
  } catch (err) {
    console.log(err);
    return res.status(500).json();
  }
});

// create a scrape request: build the apartments.com search URL from the filters,
// estimate the result count and store the request in "requested" state
app.post("/scrapes/", async (req, res) => {
  const location = req.body.location;
  const description = req.body.description;
  const price = req.body.price;
  const beds = req.body.beds;
  const type = req.body.type;
  const lifestyle = req.body.lifestyle;
  // query builder
  let query = `https://www.apartments.com`;
  if (type) {
    query += `/${type}`;
  }
  if (location) {
    const locationQuery = location.replace(", ", "-").replace(" ", "-").toLowerCase();
    query += `/${locationQuery}`;
  }
  if (beds) {
    query += `/${beds}-bedrooms`;
  }
  if (price) {
    if (beds) {
      query += `-over-${price}`;
    } else {
      query += `/over-${price}`;
    }
  }
  if (lifestyle) {
    query += `/${lifestyle}`;
  }
  console.log(query);
  try {
    const filterPage = await axios(query);
    const $ = cheerio.load(filterPage.data);
    const $pageRange = $(".pageRange");
    let pagesCount = 0;
    let resultCount = 0;
    if (!$pageRange.length) {
      // a single page of results: count the property links directly
      const propertyLinks = $('#placardContainer .property-link').map(function () {
        return $(this).attr('href');
      }).get();
      if (!propertyLinks.length) {
        console.error("No results");
        return res.status(404).json();
      }
      resultCount = propertyLinks.length;
    } else {
      // ".pageRange" reads e.g. "Page 1 of 12"; assume 25 results per page
      pagesCount = $pageRange.text().slice($pageRange.text().lastIndexOf("of ") + 3);
      resultCount = pagesCount * 25;
    }
    const dbo = client.db(database);
    const collection = dbo.collection('scrapes');
    const dt = new Date();
    dt.setSeconds(dt.getSeconds() + resultCount); // rough completion estimate: one second per result
    const result = await collection.insertOne({
      count: resultCount,
      pageCount: pagesCount,
      estimate: dt,
      sourceUrl: query,
      location: location,
      description: description,
      filters: [
        { name: 'price', value: price },
        { name: 'beds', value: beds },
        { name: 'type', value: type },
        { name: 'lifestyle', value: lifestyle },
      ],
      status: "requested"
    });
    console.log(result);
  } catch (err) {
    console.log(err);
    return res.status(500).json();
  }
  return res.json();
});
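
// For reference: a scrape document moves through three states. The insertOne
// above creates it with status "requested"; the PATCH /scrapes/:id/execute
// handler below sets status "pending" and a startDate; the agenda 'scrape' job
// finally sets status "done", an endDate and the parsed listings in `result`.
// Field names come from this file; the values here are made-up examples:
//
//   {
//     _id: ObjectID("..."),
//     count: 50, pageCount: 2, estimate: ISODate("..."),
//     sourceUrl: "https://www.apartments.com/austin-tx/2-bedrooms",
//     location: "Austin, TX", description: "...",
//     filters: [{ name: "price", value: null }, ...],
//     status: "done", startDate: ISODate("..."), endDate: ISODate("..."),
//     result: [{ url: "https://www.apartments.com/...", /* parser fields */ }]
//   }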
// queue the background scrape job for an existing scrape request
app.patch("/scrapes/:id/execute", async (req, res) => {
  const id = req.params.id;
  try {
    const dbo = client.db(database);
    const collection = dbo.collection('scrapes');
    const o_id = new ObjectID(id);
    const newvalues = { $set: { status: "pending", startDate: new Date() } };
    await collection.updateOne({ _id: o_id }, newvalues);
    await agenda.now('scrape', { _id: o_id });
    return res.status(204).json();
  } catch (err) {
    console.log(err);
    return res.status(500).json();
  }
});

const port = 3333;
app.listen(port, () => {
  console.log(`Example app listening at http://localhost:${port}`);
});

// Handles graceful stopping of jobs and the database connection
function graceful() {
  agenda.stop(function () {
    client.close(function (e) {
      if (e) console.error(e);
      process.exit(0);
    });
  });
}
process.on('SIGTERM', graceful);
process.on('SIGINT', graceful);
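
app.js reads its connection settings through the config package (node-config): config.get("mongo"), config.get("database") and config.get("agenda"). It therefore expects a config/default.json (or equivalent) next to it. A minimal sketch, assuming a local MongoDB; only the three key names come from the code above, the values are placeholders:

config/default.json

{
  "mongo": "mongodb://localhost:27017",
  "database": "scraper",
  "agenda": "mongodb://localhost:27017/agenda"
}

The intended request flow, sketched with axios (the same HTTP client the server itself uses) against the hard-coded port 3333. The filter values and the file name client.js are illustrative, not taken from the repository:

client.js

const axios = require('axios');
const base = 'http://localhost:3333';

(async () => {
  // create a scrape request; the query builder turns these filters into an
  // apartments.com search URL
  await axios.post(`${base}/scrapes/`, {
    location: 'Austin, TX',
    beds: 2,
    price: 1500,
    description: 'two-bedroom apartments over $1500'
  });

  // the POST response body is empty, so list all scrapes to find the new _id
  const { data: scrapes } = await axios.get(`${base}/scrapes`);
  const scrape = scrapes[scrapes.length - 1];

  // queue the background job, then check on it; right after queuing the status
  // will usually still be "pending", so repeat this GET until it reports "done"
  await axios.patch(`${base}/scrapes/${scrape._id}/execute`);
  const { data } = await axios.get(`${base}/scrapes/${scrape._id}`);
  console.log(data.status, (data.result || []).length);
})();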