
const config = require('config');
const axios = require('axios');
const cheerio = require('cheerio');
const express = require('express');
const cors = require('cors');
const MongoClient = require('mongodb').MongoClient;
const ObjectID = require('mongodb').ObjectID;
const apartments = require('./apartments.js');
const houses = require('./houses.js');

// Array#filter callback that keeps only the first occurrence of each value,
// i.e. links.filter(distinct) removes duplicate links.
const distinct = (value, index, self) => {
    return self.indexOf(value) === index;
};
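// The config package loads settings from config/default.json (or the file matching
// NODE_ENV). A minimal sketch of the keys this file reads — the connection strings
// below are placeholder assumptions, not values from the original project:
//
//   {
//     "mongo": "mongodb://localhost:27017",
//     "agenda": "mongodb://localhost:27017/agenda",
//     "database": "scraper"
//   }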
// jobs
const agendaDb = config.get("agenda");
const Agenda = require('agenda').Agenda;
const agenda = new Agenda({ db: { address: agendaDb } });

// Scrapes every property linked from the source URL stored on the scrape
// document and writes the results back to the same document.
agenda.define('scrape', async function (job, done) {
    const { _id } = job.attrs.data;
    try {
        const dbo = client.db(database);
        let collection = dbo.collection('scrapes');
        let scrape = await collection.findOne({ _id: _id });
        // Collected across all pages so the final update contains every property.
        var properties = [];
        // for (var page = 1; page <= scrape.pageCount; page++) {
        // Limited to the first page for now; the full loop is commented out above.
        for (var page = 1; page <= 1; page++) {
            console.log("scraping page " + page);
            const filterPage = await axios(scrape.sourceUrl + `/${page}`);
            const html = filterPage.data;
            const $ = cheerio.load(html);
            const propertyLinks = $('#placardContainer .property-link').map(function () {
                return $(this).attr('href');
            }).get();
            const links = propertyLinks.filter(distinct);
            // var limit = 0;
            for (const link of links) {
                // if(limit == 1) break;
                try {
                    var response = await axios(link);
                    var property = apartments.apartment(cheerio.load(response.data));
                    property.url = link;
                    properties.push(property);
                    console.log(`${link} scraped.`);
                } catch (err) {
                    console.error(`${link} scrape failed.`);
                }
            }
        }
        await collection.updateOne({ _id: _id }, { $set: { status: "done", result: properties, endDate: new Date() } });
        console.log(`${_id} scraped.`);
        return done();
    } catch (err) {
        console.log(err);
        // Report the failure to Agenda instead of leaving the job hanging.
        return done(err);
    }
});

(async function () {
    await agenda.start();
})();
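// A scrape document moves through three statuses: "requested" when it is created
// by POST /scrapes/, "pending" when PATCH /scrapes/:id/execute queues the job
// above, and "done" once the job has written its results back.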
// express application
const app = express();
app.use(express.json());
app.use(cors());

// database setup
var mongoUrl = config.get("mongo");
var database = config.get("database");
var client = undefined;
MongoClient.connect(mongoUrl, function (err, db) {
    if (err) throw err;
    console.log("Database connected!");
    console.log(mongoUrl);
    client = db;
    var dbo = db.db(database);
    // Ensure the scrapes collection exists; createCollection errors if it already does.
    dbo.createCollection("scrapes", function (err, res) {
        if (err) {
            console.log("Collection already created!");
            return;
        }
        console.log("Collection created!");
    });
});
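// Note: `client` is assigned inside the asynchronous connect callback, so requests
// that arrive before the connection is established hit an undefined client and are
// answered with a 500 by the route handlers' catch blocks.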
app.get("/", async (req, res) => {
    return res.json("ok");
});

// List every scrape document.
app.get("/scrapes", async (req, res) => {
    try {
        const dbo = client.db(database);
        let collection = dbo.collection('scrapes');
        let data = await collection.find({}).toArray();
        return res.json(data);
    } catch (err) {
        console.log(err);
        return res.status(500).json();
    }
});

// Fetch a single scrape document by its Mongo _id.
app.get("/scrapes/:id", async (req, res) => {
    const id = req.params.id;
    try {
        const dbo = client.db(database);
        let collection = dbo.collection('scrapes');
        var o_id = new ObjectID(id);
        let data = await collection.findOne({ _id: o_id });
        return res.json(data);
    } catch (err) {
        console.log(err);
        return res.status(500).json();
    }
});
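// Example requests (assuming the server is running locally on port 5501):
//   curl http://localhost:5501/scrapes
//   curl http://localhost:5501/scrapes/<scrape-id>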
// Create a scrape request: build the apartments.com search URL from the submitted
// filters, estimate the result count, and store the document with status "requested".
app.post("/scrapes/", async (req, res) => {
    const location = req.body.location;
    const description = req.body.description;
    const price = req.body.price;
    const beds = req.body.beds;
    const type = req.body.type;
    const lifestyle = req.body.lifestyle;
    const baths = req.body.baths;

    // query builder
    var query = `https://www.apartments.com`;
    if (type) {
        query += `/${type}`;
    }
    if (location) {
        // Replace every comma and space so multi-word locations become slugs,
        // e.g. "Salt Lake City, UT" -> "salt-lake-city-ut".
        var locationQuery = location.replace(/,\s*/g, "-").replace(/\s+/g, "-").toLowerCase();
        query += `/${locationQuery}`;
    }
    if (beds) {
        query += `/${beds}-bedrooms`;
    }
    if (baths) {
        query += `${beds ? '-' : '/'}${baths}-bathrooms`;
    }
    if (price) {
        if (beds) {
            query += `-over-${price}`;
        } else {
            query += `/over-${price}`;
        }
    }
    if (lifestyle) {
        query += `/${lifestyle}`;
    }
    console.log(query);

    let filterPage;
    try {
        filterPage = await axios(query);
    } catch (err) {
        console.error(`Failed to load ${query}`);
        return res.status(500).json();
    }
    const html = filterPage.data;
    const $ = cheerio.load(html);

    // Estimate how many results the scrape will cover: a single page without a
    // page-range element is counted directly, otherwise 25 results per page.
    var $pageRange = $(".pageRange");
    var pagesCount = 0;
    var resultCount = 0;
    if (!$pageRange.length) {
        let propertyLinks = $('#placardContainer .property-link').map(function () {
            return $(this).attr('href');
        }).get();
        if (!propertyLinks.length) {
            console.error("No results");
            return res.status(404).json();
        }
        resultCount = propertyLinks.length;
    } else {
        // The page range reads like "Page 1 of 12"; take the number after "of ".
        pagesCount = parseInt($pageRange.text().slice($pageRange.text().lastIndexOf("of ") + 3), 10);
        resultCount = pagesCount * 25;
    }

    try {
        const dbo = client.db(database);
        let collection = dbo.collection('scrapes');
        // Rough completion estimate: one second per expected result.
        const dt = new Date();
        dt.setSeconds(dt.getSeconds() + resultCount);
        let insertResult = await collection.insertOne({
            count: resultCount,
            pageCount: pagesCount,
            estimate: dt,
            createDate: new Date(),
            sourceUrl: query,
            location: location,
            description: description,
            filters: [
                { name: 'baths', value: baths },
                { name: 'price', value: price },
                { name: 'beds', value: beds },
                { name: 'type', value: type },
                { name: 'lifestyle', value: lifestyle },
            ],
            status: "requested"
        });
        console.log(insertResult);
    } catch (err) {
        console.log(err);
        return res.status(500).json();
    }
    return res.json();
});
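// Example request with illustrative filter values (not taken from the original
// project); with this body the query builder above produces
// https://www.apartments.com/apartments/austin-tx/2-bedrooms-2-bathrooms-over-1500/pet-friendly:
//   curl -X POST http://localhost:5501/scrapes/ \
//     -H "Content-Type: application/json" \
//     -d '{"location": "Austin, TX", "type": "apartments", "beds": 2, "baths": 2, "price": 1500, "lifestyle": "pet-friendly"}'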
// Queue the scrape job for an existing scrape document and mark it pending.
app.patch("/scrapes/:id/execute", async (req, res) => {
    const id = req.params.id;
    try {
        const dbo = client.db(database);
        let collection = dbo.collection('scrapes');
        var o_id = new ObjectID(id);
        var newvalues = { $set: { status: "pending", startDate: new Date() } };
        await collection.updateOne({ _id: o_id }, newvalues);
        await agenda.now('scrape', { _id: o_id });
        return res.status(204).json();
    } catch (err) {
        console.log(err);
        return res.status(500).json();
    }
});
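// Example: start the scrape for a previously created document (substitute the _id
// returned by GET /scrapes):
//   curl -X PATCH http://localhost:5501/scrapes/<scrape-id>/execute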
const port = 5501;
app.listen(port, () => {
    console.log(`Example app listening at http://localhost:${port}`);
});

// Handles graceful stopping of jobs
function graceful() {
    agenda.stop(function () {
        client.close(function (e) {
            if (e) console.error(e);
            process.exit(0);
        });
    });
}
process.on('SIGTERM', graceful);
process.on('SIGINT', graceful);