B °-_€"ã@s\dZddlZddlZddlZdgZe dd¡ZGdd„dƒZGdd„dƒZ Gd d „d ƒZ dS) a% robotparser.py Copyright (C) 2000 Bastian Kleineidam You can choose between two licenses when using this package: 1) GNU GPLv2 2) PSF license for Python 2.2 The robots.txt Exclusion Protocol is implemented as specified in http://www.robotstxt.org/norobots-rfc.txt éNÚRobotFileParserÚ RequestRatezrequests secondsc@sjeZdZdZddd„Zdd„Zdd„Zd d „Zd d „Zd d„Z dd„Z dd„Z dd„Z dd„Z dd„ZdS)rzs This class provides a set of methods to read, parse and answer questions about a single robots.txt file. ÚcCs,g|_d|_d|_d|_| |¡d|_dS)NFr)ÚentriesÚ default_entryÚ disallow_allÚ allow_allÚset_urlÚ last_checked)ÚselfÚurl©r ú(/usr/lib/python3.7/urllib/robotparser.pyÚ__init__s  zRobotFileParser.__init__cCs|jS)z·Returns the time the robots.txt file was last fetched. This is useful for long-running web spiders that need to check for new robots.txt files periodically. )r )r r r rÚmtime$szRobotFileParser.mtimecCsddl}| ¡|_dS)zYSets the time the robots.txt file was last fetched to the current time. rN)Útimer )r rr r rÚmodified-szRobotFileParser.modifiedcCs&||_tj |¡dd…\|_|_dS)z,Sets the URL referring to a robots.txt file.ééN)r ÚurllibÚparseÚurlparseZhostÚpath)r r r r rr 5szRobotFileParser.set_urlc Cs†ytj |j¡}WnRtjjk rd}z0|jdkr:d|_n|jdkrT|jdkrTd|_Wdd}~XYnX|  ¡}|  |  d¡  ¡¡dS)z4Reads the robots.txt URL and feeds it to the parser.)i‘i“TiiôNzutf-8) rZrequestZurlopenr ÚerrorZ HTTPErrorÚcoderrÚreadrÚdecodeÚ splitlines)r ÚfÚerrÚrawr r rr:s zRobotFileParser.readcCs,d|jkr|jdkr(||_n |j |¡dS)NÚ*)Ú useragentsrrÚappend)r Úentryr r rÚ _add_entryGs  zRobotFileParser._add_entrycCs6d}tƒ}| ¡x|D]þ}|sT|dkr8tƒ}d}n|dkrT| |¡tƒ}d}| d¡}|dkrr|d|…}| ¡}|s€q| dd¡}t|ƒdkr|d ¡ ¡|d<tj   |d ¡¡|d<|ddkr|dkrê| |¡tƒ}|j   |d¡d}q|ddkr4|dkr|j   t|dd ƒ¡d}q|dd krh|dkr|j   t|dd ƒ¡d}q|dd kr¦|dkr|d ¡ ¡r t|dƒ|_d}q|dd kr|dkr|d d¡}t|ƒdkr|d ¡ ¡r|d ¡ ¡rtt|dƒt|dƒƒ|_d}qW|dkr2| |¡dS)z”Parse the input lines from a robots.txt file. We allow that a user-agent: line is not preceded by one or more blank lines. 
rréú#Nú:z user-agentZdisallowFZallowTz crawl-delayz request-rateú/)ÚEntryrr%ÚfindÚstripÚsplitÚlenÚlowerrrÚunquoter"r#Ú rulelinesÚRuleLineÚisdigitÚintÚdelayrÚreq_rate)r ÚlinesÚstater$ÚlineÚiZnumbersr r rrPsd             zRobotFileParser.parsecCs |jr dS|jrdS|jsdStj tj |¡¡}tj dd|j|j |j |j f¡}tj  |¡}|sfd}x"|j D]}| |¡rn| |¡SqnW|jrœ|j |¡SdS)z=using the parsed robots.txt decide if useragent can fetch urlFTrr))rrr rrrr0Ú urlunparserZparamsZqueryZfragmentÚquoterÚ applies_toÚ allowancer)r Ú useragentr Z parsed_urlr$r r rÚ can_fetch“s$    zRobotFileParser.can_fetchcCs4| ¡s dSx|jD]}| |¡r|jSqW|jjS)N)rrr=r5r)r r?r$r r rÚ crawl_delay°s    zRobotFileParser.crawl_delaycCs4| ¡s dSx|jD]}| |¡r|jSqW|jjS)N)rrr=r6r)r r?r$r r rÚ request_rate¸s    zRobotFileParser.request_ratecCs0|j}|jdk r||jg}d tt|ƒ¡dS)NÚ )rrÚjoinÚmapÚstr)r rr r rÚ__str__Às  zRobotFileParser.__str__N)r)Ú__name__Ú __module__Ú __qualname__Ú__doc__rrrr rr%rr@rArBrGr r r rrs    Cc@s(eZdZdZdd„Zdd„Zdd„ZdS) r2zoA rule line is a single "Allow:" (allowance==True) or "Disallow:" (allowance==False) followed by a path.cCs<|dkr|sd}tj tj |¡¡}tj |¡|_||_dS)NrT)rrr;rr<rr>)r rr>r r rrÊs  zRuleLine.__init__cCs|jdkp| |j¡S)Nr!)rÚ startswith)r Úfilenamer r rr=ÒszRuleLine.applies_tocCs|jr dndd|jS)NZAllowZDisallowz: )r>r)r r r rrGÕszRuleLine.__str__N)rHrIrJrKrr=rGr r r rr2Çsr2c@s0eZdZdZdd„Zdd„Zdd„Zdd „Zd S) r*z?An entry has one or more user-agents and zero or more rulelinescCsg|_g|_d|_d|_dS)N)r"r1r5r6)r r r rrÛszEntry.__init__cCsg}x|jD]}| d|›¡q W|jdk r@| d|j›¡|jdk rj|j}| d|j›d|j›¡| tt|j ƒ¡| d¡d  |¡S)Nz User-agent: z Crawl-delay: zRequest-rate: r)rrC) r"r#r5r6ZrequestsZsecondsÚextendrErFr1rD)r ZretÚagentZrater r rrGás    z Entry.__str__cCsF| d¡d ¡}x.|jD]$}|dkr*dS| ¡}||krdSqWdS)z2check if this entry applies to the specified agentr)rr!TF)r-r/r")r r?rOr r rr=îs zEntry.applies_tocCs$x|jD]}| |¡r|jSqWdS)zZPreconditions: - our agent applies to this entry - filename is URL decodedT)r1r=r>)r rMr9r r rr>ûs   zEntry.allowanceN)rHrIrJrKrrGr=r>r r r rr*Ùs   r*) rKÚ collectionsZ urllib.parserZurllib.requestÚ__all__Ú namedtuplerrr2r*r r r rÚ s 2