您最多选择25个主题 主题必须以字母或数字开头,可以包含连字符 (-),并且长度不得超过35个字符

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287
  1. const config = require('config');
  2. const axios = require("axios");
  3. const cheerio = require("cheerio");
  4. const express = require('express');
  5. const cors = require('cors');
  6. var stream = require('stream');
  7. var jsonxml = require('json2xml');
  8. var beautify = require('xml-beautifier');
  9. const MongoClient = require('mongodb').MongoClient;
  10. const ObjectID = require('mongodb').ObjectID;
  11. var apartments = require('./apartments.js');
  12. var houses = require('./houses.js');
  /**
   * `Array.prototype.filter` predicate that keeps only the first occurrence
   * of each value, e.g. `links.filter(distinct)`.
   *
   * @param {*} value - Element currently being tested.
   * @param {number} index - Position of `value` inside `self`.
   * @param {Array} self - The array being filtered.
   * @returns {boolean} `true` when `index` is the first position holding `value`.
   */
  const distinct = (value, index, self) => {
    // `indexOf` compares with `===` and can never locate NaN, which would drop
    // every NaN entry instead of keeping the first one.
    if (Number.isNaN(value)) {
      return self.findIndex(Number.isNaN) === index;
    }
    return self.indexOf(value) === index;
  };
  // Job scheduling: read the connection settings and create the Agenda
  // instance that the 'scrape' job is registered on.
  const { Agenda } = require('agenda');
  var mongoUrl = config.get("mongo");
  var agendaDb = config.get("agenda");
  const agenda = new Agenda({
    db: {
      address: agendaDb,
    },
  });
  /**
   * Agenda job 'scrape'.
   *
   * Loads the scrape request identified by `job.attrs.data._id`, fetches its
   * result page, follows every distinct property link found in
   * `#placardContainer`, parses each property page with
   * `apartments.apartment`, and stores the collected properties on the scrape
   * document together with `status: "done"` and `endDate`.
   *
   * @param {object} job - Agenda job; `job.attrs.data._id` is the scrape id.
   * @param {Function} done - Completion callback; receives the error on failure.
   */
  agenda.define('scrape', async function (job, done) {
    const { _id } = job.attrs.data;
    try {
      const collection = client.db(database).collection('scrapes');
      const scrape = await collection.findOne({ _id: _id });
      if (!scrape) {
        throw new Error(`scrape ${_id} not found`);
      }

      // Declared once, outside the page loop, so results from earlier pages
      // are not discarded when more than one page is processed.
      const properties = [];

      // NOTE: the loop bound is still the hard-coded 1 from the original code,
      // so only the first result page is scraped (scrape.pageCount is unused).
      for (let page = 1; page <= 1; page++) {
        console.log("scrapping page " + page);
        const filterPage = await axios(scrape.sourceUrl + `/${page}`);
        const $ = cheerio.load(filterPage.data);
        const propertyLinks = $('#placardContainer .property-link').map(function () {
          return $(this).attr('href');
        }).get();
        const links = propertyLinks.filter(distinct);

        // Property pages are fetched one at a time; a failing link is logged
        // and skipped so the rest of the page is still scraped.
        for (const link of links) {
          try {
            const response = await axios(link);
            const property = apartments.apartment(cheerio.load(response.data));
            property.url = link;
            properties.push(property);
            console.log(`${link} scraped.`);
          } catch (err) {
            console.error(`${link} scrape failed.`, err.message);
          }
        }
      }

      await collection.updateOne(
        { _id: _id },
        { $set: { status: "done", result: properties, endDate: new Date() } }
      );
      console.log(`${_id} scraped.`);
      return done();
    } catch (err) {
      console.log(err);
      // Signal the failure so the job is marked as failed instead of never completing.
      return done(err);
    }
  });
  // Start the Agenda job processor as soon as this module is loaded.
  (async () => {
    await agenda.start();
  })();
  // Express application with the JSON body parser and the CORS middleware
  // registered, in that order.
  const app = express();
  app.use(express.json(), cors());
  // Database setup: connect to MongoDB, publish the connected client through
  // the module-level `client` variable and make sure the "scrapes" collection
  // exists.
  // `mongoUrl` stays a `var` because the same name is declared with `var`
  // earlier in this file.
  var mongoUrl = config.get("mongo");
  var database = config.get("database");
  // Remains undefined until the connection callback below has run.
  var client = undefined;
  MongoClient.connect(mongoUrl, function (err, db) {
    if (err) throw err;
    // The connection string is deliberately not logged: it may embed credentials.
    console.log("Database created!");
    client = db;
    var dbo = db.db(database);
    dbo.createCollection("scrapes", function (err) {
      if (err) {
        // Only the "namespace exists" error means the collection is already
        // there; anything else is a real failure and is reported as such.
        if (err.code === 48 || err.codeName === 'NamespaceExists') {
          console.log("Collection already created!");
        } else {
          console.error("Collection creation failed:", err);
        }
        return;
      }
      console.log("Collection created!");
    });
  });
  // Root route: always answers with the JSON string "ok".
  app.get("/", async (_req, res) => res.json("ok"));
  /**
   * GET /scrapes/:id/files/xml
   *
   * Sends the scrape document with the given id as a beautified XML
   * attachment named `<id>.xml`.
   *
   * Responses: 400 when `:id` is not a valid ObjectID, 404 when no document
   * matches, 500 on any other failure.
   */
  app.get("/scrapes/:id/files/xml", async (req, res) => {
    const id = req.params.id;
    if (!ObjectID.isValid(id)) {
      return res.status(400).json();
    }
    try {
      const collection = client.db(database).collection('scrapes');
      const objectId = new ObjectID(id);
      const data = await collection.findOne({ _id: objectId });
      if (!data) {
        return res.status(404).json();
      }

      // The id and both dates are converted to strings before serialisation.
      const input = {
        ...data,
        _id: data._id.toString(),
        estimate: data.estimate?.toString(),
        // Previously copied from `estimate` by mistake.
        createDate: data.createDate?.toString(),
      };
      const xml = jsonxml(input, { header: true });
      const formattedXml = beautify(xml);

      const readStream = new stream.PassThrough();
      readStream.end(Buffer.from(formattedXml));
      // Use the canonical hex form of the id rather than raw request input in the header.
      res.set('Content-disposition', 'attachment; filename=' + objectId.toHexString() + '.xml');
      res.set('Content-Type', 'text/plain');
      readStream.pipe(res);
    } catch (err) {
      console.log(err);
      res.status(500).json();
    }
  });
  // GET /scrapes: responds with every document of the 'scrapes' collection,
  // or with an empty 500 response when the lookup fails.
  app.get("/scrapes", async (req, res) => {
    try {
      const scrapes = await client
        .db(database)
        .collection('scrapes')
        .find({})
        .toArray();
      return res.json(scrapes);
    } catch (failure) {
      console.log(failure);
      return res.status(500).json();
    }
  });
  /**
   * GET /scrapes/:id
   *
   * Responds with the scrape document that has the given id.
   *
   * Responses: 400 when `:id` is not a valid ObjectID, 404 when no document
   * matches, 500 on any other failure.
   */
  app.get("/scrapes/:id", async (req, res) => {
    const id = req.params.id;
    // A malformed id is a client error, not a server failure.
    if (!ObjectID.isValid(id)) {
      return res.status(400).json();
    }
    try {
      const collection = client.db(database).collection('scrapes');
      const data = await collection.findOne({ _id: new ObjectID(id) });
      if (!data) {
        return res.status(404).json();
      }
      return res.json(data);
    } catch (err) {
      console.log(err);
      return res.status(500).json();
    }
  });
  /**
   * Builds the apartments.com search URL for a set of filters.
   *
   * Path layout: `/<type>/<location-slug>/<beds>-bedrooms-<baths>-bathrooms-over-<price>/<lifestyle>`,
   * where every part is optional and the bed/bath/price filters share one
   * hyphen-joined segment.
   *
   * @param {object} filters
   * @param {string} [filters.type]
   * @param {string} [filters.location] - e.g. "Salt Lake City, UT".
   * @param {string|number} [filters.beds]
   * @param {string|number} [filters.baths]
   * @param {string|number} [filters.price]
   * @param {string} [filters.lifestyle]
   * @returns {string} The search URL.
   */
  function buildSearchUrl({ type, location, beds, baths, price, lifestyle }) {
    let url = `https://www.apartments.com`;
    if (type) {
      url += `/${type}`;
    }
    if (location) {
      // Global patterns: a string pattern would replace only the first match
      // and leave multi-word locations partially converted.
      const slug = location.replace(/, /g, "-").replace(/ /g, "-").toLowerCase();
      url += `/${slug}`;
    }
    // Beds, baths and price live in one segment; joining them here also covers
    // the baths-without-beds case, which used to start a new segment for price.
    const segment = [];
    if (beds) {
      segment.push(`${beds}-bedrooms`);
    }
    if (baths) {
      segment.push(`${baths}-bathrooms`);
    }
    if (price) {
      segment.push(`over-${price}`);
    }
    if (segment.length) {
      url += `/${segment.join('-')}`;
    }
    if (lifestyle) {
      url += `/${lifestyle}`;
    }
    return url;
  }

  /**
   * POST /scrapes/
   *
   * Registers a new scrape request. The body may carry `location`,
   * `description`, `price`, `beds`, `baths`, `type` and `lifestyle`. The
   * search page is fetched once to size the job, then a document with
   * `status: "requested"` is inserted into the 'scrapes' collection.
   *
   * Responses: 400 when `location` is present but not a string, 404 when the
   * search yields no listings, 500 on upstream or database failure, otherwise
   * an empty 200 JSON response.
   */
  app.post("/scrapes/", async (req, res) => {
    const { location, description, price, beds, type, lifestyle, baths } = req.body;
    if (location != null && typeof location !== 'string') {
      return res.status(400).json();
    }

    const query = buildSearchUrl({ type, location, beds, baths, price, lifestyle });
    console.log(query);

    // Fetch inside try/catch: a rejected request would otherwise be an
    // unhandled rejection and the client would never get a response.
    let $;
    try {
      const filterPage = await axios(query);
      $ = cheerio.load(filterPage.data);
    } catch (err) {
      console.log(err);
      return res.status(err.response?.status === 404 ? 404 : 500).json();
    }

    // The page count is the number after the last "of " in the `.pageRange`
    // text. It is parsed to a real number so `pageCount` is stored numerically
    // and an unparseable value cannot produce an invalid estimate date.
    const pageRangeText = $(".pageRange").text();
    const parsedPages = Number.parseInt(
      pageRangeText.slice(pageRangeText.lastIndexOf("of ") + 3),
      10
    );
    let pagesCount = 0;
    let resultCount = 0;
    if (Number.isInteger(parsedPages) && parsedPages > 0) {
      pagesCount = parsedPages;
      // Estimate only: presumably 25 listings per result page.
      resultCount = pagesCount * 25;
    } else {
      // No usable page range: count the listings on the page itself.
      const propertyLinks = $('#placardContainer .property-link').map(function () {
        return $(this).attr('href');
      }).get();
      if (!propertyLinks.length) {
        console.error("No results");
        return res.status(404).json();
      }
      resultCount = propertyLinks.length;
    }

    try {
      const collection = client.db(database).collection('scrapes');
      // Estimated completion time: one second per expected result.
      const dt = new Date();
      dt.setSeconds(dt.getSeconds() + resultCount);
      const insertResult = await collection.insertOne({
        count: resultCount,
        pageCount: pagesCount,
        estimate: dt,
        createDate: new Date(),
        sourceUrl: query,
        location: location,
        description: description,
        filters: [
          { name: 'baths', value: baths },
          { name: 'price', value: price },
          { name: 'beds', value: beds },
          { name: 'type', value: type },
          { name: 'lifestyle', value: lifestyle },
        ],
        status: "requested"
      });
      console.log(insertResult);
    } catch (err) {
      console.log(err);
      return res.status(500).json();
    }
    return res.json();
  });
  /**
   * PATCH /scrapes/:id/execute
   *
   * Marks the scrape as "pending" (with `startDate`) and queues the 'scrape'
   * Agenda job for it.
   *
   * Responses: 204 on success, 400 when `:id` is not a valid ObjectID, 404
   * when no scrape matches, 500 on any other failure.
   */
  app.patch("/scrapes/:id/execute", async (req, res) => {
    const id = req.params.id;
    if (!ObjectID.isValid(id)) {
      return res.status(400).json();
    }
    try {
      const collection = client.db(database).collection('scrapes');
      const o_id = new ObjectID(id);
      const newvalues = { $set: { status: "pending", startDate: new Date() } };
      const { matchedCount } = await collection.updateOne({ _id: o_id }, newvalues);
      // Do not queue a job for a scrape that does not exist.
      if (matchedCount === 0) {
        return res.status(404).json();
      }
      // Awaited so a scheduling failure reaches the catch below instead of
      // becoming an unhandled rejection after the response was sent.
      await agenda.now('scrape', { _id: o_id });
      return res.status(204).json();
    } catch (err) {
      console.log(err);
      return res.status(500).json();
    }
  });
  // Start the HTTP server on the fixed port and log the local URL once it is listening.
  const port = 5501;
  app.listen(port, function onListening() {
    console.log(`Example app listening at http://localhost:${port}`);
  });
  /**
   * Handles graceful stopping of jobs: stops Agenda, closes the MongoDB
   * client if one was opened, then exits the process with code 0.
   *
   * Promise-based because current Agenda releases expose `stop()` as a
   * Promise-returning method; a callback passed to it would never run and the
   * process would never exit. (NOTE(review): confirm against the installed
   * Agenda version.)
   *
   * @returns {Promise<void>}
   */
  async function graceful() {
    try {
      await agenda.stop();
      // `client` is still undefined when the signal arrives before the
      // database connection has been established.
      if (client) {
        await client.close();
      }
    } catch (e) {
      // No `logger` is defined in this file; report through the console.
      console.error(e);
    } finally {
      process.exit(0);
    }
  }
  process.on('SIGTERM', graceful);
  process.on('SIGINT', graceful);