mirror of
https://github.com/zulip/zulip.git
synced 2025-11-22 07:21:23 +00:00
documentation_crawler: Add exclude list.
This works around the issue that Google Calendar returns errors in response to HTTP HEAD requests.
This commit is contained in:
@@ -12,6 +12,13 @@ from scrapy.utils.url import url_has_any_extension
|
|||||||
|
|
||||||
from typing import Any, Generator, List, Optional, Tuple
|
from typing import Any, Generator, List, Optional, Tuple
|
||||||
|
|
||||||
|
# URLs whose failed responses are deliberately ignored by the crawler's
# error handling; entries are matched by exact string comparison in
# exclude_error().
EXCLUDED_URLS = [
    # Google Calendar unconditionally answers HEAD requests with a 404.
    'https://calendar.google.com/calendar/embed?src=ktiduof4eoh47lmgcl2qunnc0o@group.calendar.google.com',
    # Frequently answers HEAD requests with a 409 error.
    'https://medium.freecodecamp.com',
]  # type: List[str]
|
||||||
|
|
||||||
|
|
||||||
class BaseDocumentationSpider(scrapy.Spider):
|
class BaseDocumentationSpider(scrapy.Spider):
|
||||||
name = None # type: Optional[str]
|
name = None # type: Optional[str]
|
||||||
@@ -82,10 +89,17 @@ class BaseDocumentationSpider(scrapy.Spider):
|
|||||||
request.dont_filter = True
|
request.dont_filter = True
|
||||||
yield request
|
yield request
|
||||||
|
|
||||||
|
def exclude_error(self, url):
    # type: (str) -> bool
    """Return True when failures for ``url`` should be ignored.

    A URL is excluded only if it is exactly equal to one of the entries
    in the module-level EXCLUDED_URLS list; no prefix or domain matching
    is performed.
    """
    # The membership test already yields a bool, so return it directly
    # instead of branching to return True/False.
    return url in EXCLUDED_URLS
|
||||||
|
|
||||||
def error_callback(self, failure):
|
def error_callback(self, failure):
|
||||||
# type: (Any) -> Optional[Generator[Any, None, None]]
|
# type: (Any) -> Optional[Generator[Any, None, None]]
|
||||||
if hasattr(failure.value, 'response') and failure.value.response:
|
if hasattr(failure.value, 'response') and failure.value.response:
|
||||||
response = failure.value.response
|
response = failure.value.response
|
||||||
|
if self.exclude_error(response.url):
|
||||||
|
return None
|
||||||
if response.status == 404:
|
if response.status == 404:
|
||||||
self._set_error_state()
|
self._set_error_state()
|
||||||
raise Exception('Page not found: {}'.format(response))
|
raise Exception('Page not found: {}'.format(response))
|
||||||
|
|||||||
Reference in New Issue
Block a user