Nelze vybrat více než 25 témat Téma musí začínat písmenem nebo číslem, může obsahovat pomlčky („-“) a může být dlouhé až 35 znaků.

app.js 5.8KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225
  1. const config = require('config');
  2. const axios = require("axios");
  3. const cheerio = require("cheerio");
  4. const express = require('express');
  5. const cors = require('cors');
  6. const MongoClient = require('mongodb').MongoClient;
  7. const ObjectID = require('mongodb').ObjectID;
  8. var apartments = require('./apartments.js');
  9. var houses = require('./houses.js');
  10. // jobs
  11. var mongoUrl = config.get("mongo");
  12. var agendaDb = config.get("agenda");
  13. const Agenda = require('agenda').Agenda;
  14. const agenda = new Agenda({ db: { address: agendaDb } });
  15. agenda.define('scrape', async function (job, done) {
  16. const { _id } = job.attrs.data;
  17. try {
  18. const dbo = client.db(database);
  19. let collection = dbo.collection('scrapes');
  20. let scrape = await collection.findOne({ _id: _id });
  21. for (var page = 1; page <= scrape.pageCount; page++) {
  22. console.log("scrapping page " + page)
  23. const filterPage = await axios(scrape.sourceUrl + `/${page}`);
  24. const html = filterPage.data;
  25. const $ = cheerio.load(html);
  26. const propertyLinks = $('#placardContainer .property-link').map(function () {
  27. return $(this).attr('href');
  28. }).get();
  29. var properties = [];
  30. for (const link of propertyLinks) {
  31. try {
  32. var response = await axios(link);
  33. var property = apartments.apartment(cheerio.load(response.data));
  34. properties.push(property);
  35. console.log(`${link} scraped.`);
  36. }catch(err){
  37. console.error(`${link} scrape failed.`);
  38. }
  39. }
  40. }
  41. await collection.updateOne({ _id: _id }, { $set: { status: "done", result: properties } });
  42. console.log(`${_id} scraped.`);
  43. return done();
  44. } catch (err) {
  45. console.log(err);
  46. }
  47. });
  48. (async function () {
  49. await agenda.start();
  50. })();
  51. // express application
  52. const app = express();
  53. app.use(express.json());
  54. app.use(cors());
  55. // database setup
  56. var mongoUrl = config.get("mongo");
  57. var database = config.get("database");
  58. var client = undefined;
  59. MongoClient.connect(mongoUrl, function (err, db) {
  60. if (err) throw err;
  61. console.log("Database created!");
  62. console.log(mongoUrl);
  63. client = db;
  64. var dbo = db.db(database);
  65. dbo.createCollection("scrapes", function (err, res) {
  66. if (err) {
  67. console.log("Collection already created!");
  68. return;
  69. }
  70. console.log("Collection created!");
  71. });
  72. });
  73. app.get("/scrapes", async (req, res) => {
  74. try {
  75. const dbo = client.db(database);
  76. let collection = dbo.collection('scrapes');
  77. let data = await collection.find({}).toArray();
  78. return res.json(data);
  79. } catch (err) {
  80. console.log(err);
  81. return res.status(500).json();
  82. }
  83. });
  84. app.get("/scrapes/:id", async (req, res) => {
  85. const id = req.params.id;
  86. try {
  87. const dbo = client.db(database);
  88. let collection = dbo.collection('scrapes');
  89. var o_id = new ObjectID(id);
  90. let data = await collection.findOne({ _id: o_id });
  91. return res.json(data);
  92. } catch (err) {
  93. console.log(err);
  94. res.status(500).json();
  95. }
  96. });
  97. app.post("/scrapes/", async (req, res) => {
  98. const location = req.body.location;
  99. const price = req.body.price;
  100. const beds = req.body.beds;
  101. const type = req.body.type;
  102. const lifestyle = req.body.lifestyle;
  103. // query builder
  104. var query = `https://www.apartments.com`;
  105. if (type) {
  106. query += `/${type}`;
  107. }
  108. if (location) {
  109. var locationQuery = location.replace(", ", "-").replace(" ", "-").toLowerCase();
  110. query += `/${locationQuery}`;
  111. }
  112. if (beds) {
  113. query += `/${beds}-bedrooms`;
  114. }
  115. if (price) {
  116. if (beds) {
  117. query += `-over-${price}`;
  118. } else {
  119. query += `/over-${price}`;
  120. }
  121. }
  122. if (lifestyle) {
  123. query += `/${lifestyle}`;
  124. }
  125. console.log(query);
  126. const filterPage = await axios(query);
  127. const html = filterPage.data;
  128. const $ = cheerio.load(html);
  129. var $pageRange = $(".pageRange");
  130. var pagesCount = 0;
  131. var resultCount = 0;
  132. if (!$pageRange.length) {
  133. let propertyLinks = $('#placardContainer .property-link').map(function () {
  134. return $(this).attr('href');
  135. }).get();
  136. if (!propertyLinks.length) {
  137. console.error("No results");
  138. return res.status(404).json();
  139. }
  140. resultCount = propertyLinks.length;
  141. } else {
  142. pagesCount = $pageRange.text().slice($pageRange.text().lastIndexOf("of ") + 3);
  143. resultCount = pagesCount * 25;
  144. }
  145. try {
  146. const dbo = client.db(database);
  147. let collection = dbo.collection('scrapes');
  148. const dt = new Date();
  149. dt.setSeconds( dt.getSeconds() + resultCount );
  150. let res = await collection.insertOne({
  151. count: resultCount,
  152. pageCount: pagesCount,
  153. estimate: dt,
  154. sourceUrl: query,
  155. location: location,
  156. filters: [
  157. { name: 'price', value: price },
  158. { name: 'beds', value: beds },
  159. { name: 'type', value: type },
  160. { name: 'lifestyle', value: lifestyle },
  161. ],
  162. status: "requested"
  163. });
  164. console.log(res);
  165. } catch (err) {
  166. console.log(err);
  167. return res.status(500).json();
  168. }
  169. return res.json();
  170. });
  171. app.patch("/scrapes/:id/execute", async (req, res) => {
  172. const id = req.params.id;
  173. try {
  174. const dbo = client.db(database);
  175. let collection = dbo.collection('scrapes');
  176. var o_id = new ObjectID(id);
  177. var newvalues = { $set: { status: "pending" } };
  178. await collection.updateOne({ _id: o_id }, newvalues);
  179. agenda.now('scrape', { _id: o_id });
  180. return res.status(204).json();
  181. } catch (err) {
  182. console.log(err);
  183. res.status(500).json();
  184. }
  185. });
  186. const port = 3333;
  187. app.listen(port, () => {
  188. console.log(`Example app listening at http://localhost:${port}`)
  189. });
  190. // Handles graceful stopping of jobs
  191. function graceful() {
  192. agenda.stop(function () {
  193. client.close(function (e) {
  194. if (e) logger.error(e);
  195. process.exit(0);
  196. });
  197. });
  198. }
  199. process.on('SIGTERM', graceful);
  200. process.on('SIGINT', graceful);